/*
 * stripe_reset:
 *	Reset (tear down) the stripe metadevice named by mirp->mnum.
 *
 *	The request is rejected if the minor number is out of range, the
 *	set's database is stale, the unit is not set up, the unit has a
 *	parent, or the unit is currently open.  On success the unit is
 *	reset via reset_stripe(), the set's available-unit count is bumped
 *	and, for multi-node sets, the next-unit hint is updated.
 *
 *	Errors are recorded in mirp->mde; the return value is whatever the
 *	md error routine returns, or 0 on success.
 */
static int
stripe_reset(md_i_reset_t *mirp)
{
	minor_t		mnum = mirp->mnum;
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui;
	ms_unit_t	*un;
	int		rval = 0;

	mdclrerror(&mirp->mde);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
		return (mdmderror(&mirp->mde, MDE_INVAL_UNIT, mnum));
	}

	if (md_get_setstatus(setno) & MD_SET_STALE) {
		return (mdmddberror(&mirp->mde, MDE_DB_STALE, mnum, setno));
	}

	if ((un = MD_UNIT(mnum)) == NULL)
		return (mdmderror(&mirp->mde, MDE_UNIT_NOT_SETUP, mnum));

	/* Holding the unit array lock as writer keeps new opens out. */
	rw_enter(&md_unit_array_rw.lock, RW_WRITER);

	if (MD_HAS_PARENT(un->c.un_parent)) {
		rval = mdmderror(&mirp->mde, MDE_IN_USE, mnum);
		goto out;
	}

	/* Serialize against open/close while we look at the open state. */
	ui = MDI_UNIT(mnum);
	un = md_unit_openclose_enter(ui);
	if (md_unit_isopen(ui)) {
		md_unit_openclose_exit(ui);
		rval = mdmderror(&mirp->mde, MDE_IS_OPEN, mnum);
		goto out;
	}
	md_unit_openclose_exit(ui);

	reset_stripe(un, mnum, 1);

	/* The unit slot is free again. */
	md_set[setno].s_un_avail++;

	/*
	 * For a multi-node set, reset s_un_next so that every node has
	 * the same view of the next available slot when nodes are -w
	 * and -j.
	 */
	if (MD_MNSET_SETNO(setno))
		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));

out:
	rw_exit(&md_unit_array_rw.lock);
	return (rval);
}
/*
 * NAME:	raid_resync_unit
 *
 * DESCRIPTION: Resync entry point for a RAID metadevice.
 *		Opens the unit internally and, if that works, hands the
 *		actual work to a new thread running resync_unit().
 *
 * PARAMETERS:	minor_t	  mnum - minor number identity of metadevice
 *		md_error_t *ep - output error parameter
 *
 * RETURN:	On error return 1 or set ep to nonzero, otherwise return 0.
 *
 * LOCKS:	Takes and drops the Unit Writer Lock, but only on the
 *		open-failure path, around release_resync_request().
 */
int
raid_resync_unit(
	minor_t			mnum,
	md_error_t		*ep
)
{
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mr_unit_t	*un = MD_UNIT(mnum);

	if (md_get_setstatus(setno) & MD_SET_STALE) {
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
	}

	/* The column being resynced must be flagged for copy or regen. */
	ASSERT(un->un_column[un->un_resync_index].un_devflags &
	    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));

	/* Refuse to resync a device that is missing or marked errored. */
	if (ui == NULL)
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	if (ui->ui_tstate & MD_DEV_ERRORED)
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));

	if (raid_internal_open(mnum, FREAD | FWRITE, OTYP_LYR, 0) == 0) {
		/* Open succeeded: run resync_unit in its own thread. */
		(void) thread_create(NULL, 0, resync_unit,
		    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
		return (0);
	}

	/*
	 * The open failed.  Drop the pending resync request under the
	 * unit writer lock, post an event, and report the failure.
	 */
	(void) md_unit_writerlock(ui);
	release_resync_request(mnum);
	md_unit_writerexit(ui);

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
	    setno, MD_SID(un));

	return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
}
/*
 * Example no. 3
 * 0
 */
/*
 * set_hs:
 *	Dispatch a hot spare request to the handler that matches
 *	shs->shs_cmd.  The request is refused if the set's database is
 *	stale, and an unrecognized command yields MDE_INVAL_HSOP.
 *	Errors are recorded in shs->mde.
 */
static int
set_hs(
	set_hs_params_t	*shs
)
{
	mdclrerror(&shs->mde);

	if (md_get_setstatus(shs->md_driver.md_setno) & MD_SET_STALE) {
		return (mdmddberror(&shs->mde, MDE_DB_STALE, NODEV32,
		    shs->md_driver.md_setno));
	}

	if (shs->shs_cmd == ADD_HOT_SPARE)
		return (seths_add(shs));

	if (shs->shs_cmd == DELETE_HOT_SPARE)
		return (seths_delete(shs));

	if (shs->shs_cmd == REPLACE_HOT_SPARE)
		return (seths_replace(shs));

	if (shs->shs_cmd == FIX_HOT_SPARE)
		return (seths_enable(shs));

	/* Anything else is not a hot spare operation we know about. */
	return (mderror(&shs->mde, MDE_INVAL_HSOP));
}
/*
 * stripe_change:
 *	Change the hot spare pool associated with a stripe metadevice.
 *
 *	Nothing is done unless msp->params.change_hsp_id is set.  The
 *	change is refused while any component has a hot spare in use.
 *	Otherwise a reference is taken on the new pool (if any), the
 *	reference on the old pool (if any) is dropped, the unit's pool id
 *	is updated, and the affected records are committed.
 *
 *	The unit writer lock is taken through `lock' and is not released
 *	in this function.
 *
 *	Errors are recorded in msp->mde; returns 0 on success.
 */
static int
stripe_change(
	md_stripe_params_t	*msp,
	IOLOCK			*lock
)
{
	minor_t			mnum = msp->mnum;
	set_t			setno = MD_MIN2SET(mnum);
	ms_params_t		*pp = &msp->params;
	mddb_recid_t		recids[4];
	ms_unit_t		*un;
	mdi_unit_t		*ui;
	ms_comp_t		*comps;
	struct ms_row		*rowp;
	int			row, n, idx;
	int			next_slot;
	int			new_hsp_held = 0;
	int			err;

	mdclrerror(&msp->mde);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
		return (mdmderror(&msp->mde, MDE_INVAL_UNIT, mnum));
	}

	if (md_get_setstatus(setno) & MD_SET_STALE) {
		return (mdmddberror(&msp->mde, MDE_DB_STALE, mnum, setno));
	}

	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return (mdmderror(&msp->mde, MDE_UNIT_NOT_SETUP, mnum));

	/* Only the hot spare pool can be changed; anything else is a no-op. */
	if (!pp->change_hsp_id)
		return (0);

	un = (ms_unit_t *)md_ioctl_writerlock(lock, ui);

	/*
	 * Walk every component of every row and make sure none of them
	 * currently has a hot spare swapped in.
	 */
	comps = (ms_comp_t *)((void *)((char *)un + un->un_ocomp));
	for (row = 0; row < un->un_nrows; row++) {
		rowp = &un->un_row[row];
		idx = rowp->un_icomp;
		for (n = 0; n < rowp->un_ncomp; n++, idx++) {
			if (comps[idx].un_mirror.ms_hs_id != 0) {
				return (mdmderror(&msp->mde, MDE_HS_IN_USE,
				    mnum));
			}
		}
	}

	/*
	 * recids[0] is reserved for the unit's own record; slots 1 and 2
	 * receive the record ids of the hot spare pools we touch.
	 */
	recids[1] = 0;
	recids[2] = 0;
	next_slot = 1;

	if (pp->hsp_id != -1) {
		/* take a reference on the new pool */
		err = md_hot_spare_ifc(HSP_INCREF, pp->hsp_id, 0, 0,
		    &recids[1], NULL, NULL, NULL);
		if (err) {
			return (mdhsperror(&msp->mde, MDE_INVAL_HSP,
			    pp->hsp_id));
		}
		new_hsp_held = 1;
		next_slot = 2;
	}

	if (un->un_hsp_id != -1) {
		/* drop the reference on the old pool */
		err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
		    &recids[next_slot], NULL, NULL, NULL);
		if (err) {
			/*
			 * NOTE(review): the error names the new pool id
			 * (pp->hsp_id) although it was the old pool
			 * (un->un_hsp_id) whose DECREF failed -- confirm
			 * this is intended.
			 */
			err = mdhsperror(&msp->mde, MDE_INVAL_HSP,
			    pp->hsp_id);
			if (new_hsp_held) {
				/*
				 * Undo the INCREF on the new pool.  Its
				 * record was never committed, so there is
				 * nothing to commit here either.
				 */
				(void) md_hot_spare_ifc(HSP_DECREF,
				    pp->hsp_id, 0, 0,
				    &recids[1], NULL, NULL, NULL);
			}
			return (err);
		}
	}

	un->un_hsp_id = pp->hsp_id;

	/* Commit the unit record together with the touched pool records. */
	recids[0] = un->c.un_record_id;
	recids[3] = 0;
	mddb_commitrecs_wrapper(recids);

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));

	return (0);
}
/*
 * stripe_grow:
 *	Grow handler for a stripe metadevice.  A new unit record of
 *	mgp->size bytes is created and filled from the user-supplied unit
 *	struct (mgp->mdp); the existing unit's leading fields and its
 *	component structs are then copied over it, the new record is
 *	committed, the old record is deleted, and the new unit replaces the
 *	old one in the in-core unit array.
 *
 * PARAMETERS:	d     - md_grow_params_t describing the request
 *		mode  - passed through to ddi_copyin()
 *		lockp - IOLOCK used to take this unit's writer lock
 *
 * RETURN:	0 on success; EFAULT if a copyin fails; EINVAL if
 *		mgp->nrows does not match the current unit's row count;
 *		otherwise the value returned by the md error routine, with
 *		details left in mgp->mde.
 *
 * LOCKS:	Takes md_unit_array_rw as writer, then the writer lock of
 *		each listed parent, then this unit's writer lock via lockp.
 *		The parent locks and md_unit_array_rw are dropped at `out';
 *		the lock taken via lockp is not released in this function.
 */
static int
stripe_grow(void *d, int mode, IOLOCK *lockp)
{
	minor_t		mnum;
	ms_unit_t	*un, *new_un;
	mdi_unit_t	*ui;
	minor_t		*par = NULL;
	IOLOCK		*plock = NULL;
	ms_comp_t	*mdcomp, *new_comp;
	int		row, i, c;
	mddb_recid_t	ms_recid;
	mddb_recid_t	old_vtoc = 0;
	mddb_recid_t	*recids;
	md_create_rec_option_t options;
	mddb_type_t	typ1;
	int		err;
	int64_t		tb, atb;
	uint_t		nr, oc;
	int		opened;
	int		rval = 0;
	set_t		setno;
	md_error_t	*mdep;
	int		npar;
	int		rid;
	int		num_recs;
	u_longlong_t	rev;
	md_grow_params_t	*mgp = d;


	mnum = mgp->mnum;
	mdep = &mgp->mde;
	setno = MD_MIN2SET(mnum);
	npar = mgp->npar;

	mdclrerror(mdep);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits))
		return (mdmderror(mdep, MDE_INVAL_UNIT, mnum));

	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(mdep, MDE_DB_STALE, mnum, setno));

	ui = MDI_UNIT(mnum);
	if (ui == NULL) {
		return (mdmderror(mdep, MDE_UNIT_NOT_SETUP, mnum));
	}

	/*
	 * Copy in the caller's list of parent minor numbers and allocate
	 * one IOLOCK per parent.  Both arrays are freed here on a copyin
	 * failure and at `out' otherwise.
	 */
	if (npar >= 1) {
		ASSERT((minor_t *)(uintptr_t)mgp->par != NULL);
		par = kmem_alloc(npar * sizeof (*par), KM_SLEEP);
		plock = kmem_alloc(npar * sizeof (*plock), KM_SLEEP);
		if (ddi_copyin((caddr_t)(uintptr_t)mgp->par, (caddr_t)par,
		    (npar * sizeof (*par)), mode) != 0) {
			kmem_free(par, npar * sizeof (*par));
			kmem_free(plock, npar * sizeof (*plock));
			return (EFAULT);
		}
	}

	/*
	 * we grab unit reader/writer first, then parent locks,
	 * then our own.
	 * we expect parent units to be sorted to avoid deadlock
	 */
	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
	for (i = 0; i < npar; ++i) {
		(void) md_ioctl_writerlock(&plock[i],
		    MDI_UNIT(par[i]));
	}
	un = (ms_unit_t *)md_ioctl_writerlock(lockp, ui);

	/* The caller's idea of the current row count must match ours. */
	if (un->un_nrows != mgp->nrows) {
		rval = EINVAL;
		goto out;
	}

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    stripe_md_ops.md_driver.md_drivername);

	/*
	 * Preserve the friendly name nature of growing device.
	 */
	options = MD_CRO_STRIPE;
	if (un->c.un_revision & MD_FN_META_DEV)
		options |= MD_CRO_FN;
	if (mgp->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		/* 64-bit records are rejected when built for ILP32 */
		rval = mdmderror(mdep, MDE_UNIT_TOO_LARGE, mnum);
		goto out;
#else
		ms_recid = mddb_createrec((size_t)mgp->size, typ1, 0,
		    MD_CRO_64BIT | options, setno);
#endif
	} else {
		ms_recid = mddb_createrec((size_t)mgp->size, typ1, 0,
		    MD_CRO_32BIT | options, setno);
	}


	/* A negative record id is a database status code. */
	if (ms_recid < 0) {
		rval = mddbstatus2error(mdep, (int)ms_recid, mnum, setno);
		goto out;
	}

	/* get the address of the new unit */
	new_un = (ms_unit_t *)mddb_getrecaddr(ms_recid);

	/*
	 * It is okay that we muck with the new unit here,
	 * since no one else will know about the unit struct
	 * until we commit it. If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet and the old unit struct will be found.
	 */

	/* copy in the user's unit struct */
	err = ddi_copyin((caddr_t)(uintptr_t)mgp->mdp, (caddr_t)new_un,
	    (size_t)mgp->size, mode);
	if (err) {
		mddb_deleterec_wrapper(ms_recid);
		rval = EFAULT;
		goto out;
	}
	if (options & MD_CRO_FN)
		new_un->c.un_revision |= MD_FN_META_DEV;

	/*
	 * allocate the real recids array.  since we may have to
	 * commit underlying metadevice records, we need an
	 * array of size: total number of new components being
	 * attached + 2 (one for the stripe itself, one for the
	 * end marker).
	 *
	 * NOTE(review): the loop below sums un_ncomp over every row of
	 * the new unit, not only the newly attached rows, so the array
	 * is at least as large as described -- confirm intent.
	 */
	num_recs = 2;
	rid = 0;
	for (row = 0; row < new_un->un_nrows; row++) {
		struct ms_row *mdr = &new_un->un_row[row];
		num_recs += mdr->un_ncomp;
	}
	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
	recids[rid++] = ms_recid;

	/*
	 * Save a few of the new unit structs fields.
	 * Before they get clobbered.
	 */
	tb = new_un->c.un_total_blocks;
	atb = new_un->c.un_actual_tb;
	nr = new_un->un_nrows;
	oc = new_un->un_ocomp;
	rev = new_un->c.un_revision;

	/*
	 * Copy the old unit struct (static stuff)
	 * into new unit struct
	 *
	 * NOTE(review): the length is the fixed unit struct plus (nr - 2)
	 * further row entries, where nr is the NEW row count; presumably
	 * this is sized so that only the old unit's rows are copied when
	 * a single row is being added -- confirm.
	 */
	bcopy((caddr_t)un, (caddr_t)new_un,
	    sizeof (ms_unit_t) + ((nr - 2) * (sizeof (struct ms_row))));

	/*
	 * Restore the saved stuff.
	 */
	new_un->c.un_total_blocks = tb;
	md_nblocks_set(mnum, new_un->c.un_total_blocks);
	new_un->c.un_actual_tb = atb;
	new_un->un_nrows = nr;
	new_un->un_ocomp = oc;
	new_un->c.un_revision = rev;

	new_un->c.un_record_id = ms_recid;
	new_un->c.un_size = mgp->size;

	/* All 64 bit metadevices only support EFI labels. */
	if (mgp->options & MD_CRO_64BIT) {
		new_un->c.un_flag |= MD_EFILABEL;
		/*
		 * If the device was previously smaller than a terabyte,
		 * and had a vtoc record attached to it, we remove the
		 * vtoc record, because the layout has changed completely.
		 */
		if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
		    (un->c.un_vtoc_id != 0)) {
			old_vtoc = un->c.un_vtoc_id;
			new_un->c.un_vtoc_id =
			    md_vtoc_to_efi_record(old_vtoc, setno);
		}
	}

	/*
	 * Copy the old component structs into the new unit struct.
	 * The component array of each unit lives un_ocomp bytes past the
	 * start of that unit.
	 */
	mdcomp = (ms_comp_t *)((void *)&((char *)un)[un->un_ocomp]);
	new_comp = (ms_comp_t *)((void *)&((char *)new_un)[new_un->un_ocomp]);
	for (row = 0; row < un->un_nrows; row++) {
		struct ms_row *mdr = &un->un_row[row];
		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++, c++) {
			bcopy((caddr_t)&mdcomp[c], (caddr_t)&new_comp[c],
			    sizeof (ms_comp_t));
		}
	}

	opened = md_unit_isopen(ui);

	/*
	 * Set parent on metadevices being added.
	 * Open the new devices being added.
	 * NOTE: currently soft partitions are the only metadevices
	 * which can appear within a stripe.
	 *
	 * Only the rows beyond the old unit's row count are visited.
	 */
	for (row = un->un_nrows; row < new_un->un_nrows; row++) {
		struct ms_row *mdr = &new_un->un_row[row];
		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
			struct ms_comp	*mdc = &new_comp[c++];
			md_dev64_t comp_dev;
			md_unit_t *comp_un;

			comp_dev = mdc->un_dev;
			/* set parent on any metadevices */
			if (md_getmajor(comp_dev) == md_major) {
				comp_un = MD_UNIT(md_getminor(comp_dev));
				/* its record is committed along with ours */
				recids[rid++] = MD_RECID(comp_un);
				md_set_parent(comp_dev, MD_SID(new_un));
			}

			if (opened) {
				md_dev64_t tmpdev = mdc->un_dev;
				/*
				 * Open by device id
				 * Check if this comp is hotspared and
				 * if it is then use the key for hotspare
				 */
				tmpdev = md_resolve_bydevid(mnum, tmpdev,
				    mdc->un_mirror.ms_hs_id ?
				    mdc->un_mirror.ms_hs_key : mdc->un_key);
				/* the result of the open is not checked */
				(void) md_layered_open(mnum, &tmpdev,
				    MD_OFLG_NULL);
				mdc->un_dev = tmpdev;
				mdc->un_mirror.ms_flags |= MDM_S_ISOPEN;
			}
		}
	}

	/* set end marker */
	recids[rid] = 0;
	/* commit new unit struct */
	mddb_commitrecs_wrapper(recids);

	/* delete old unit struct */
	mddb_deleterec_wrapper(un->c.un_record_id);

	/* place new unit in in-core array */
	md_nblocks_set(mnum, new_un->c.un_total_blocks);
	MD_UNIT(mnum) = new_un;

	/*
	 * If old_vtoc has a non zero value, we know:
	 * - This unit crossed the border from smaller to larger one TB
	 * - There was a vtoc record for the unit,
	 * - This vtoc record is no longer needed, because
	 *   a new efi record has been created for this un.
	 */
	if (old_vtoc != 0) {
		mddb_deleterec_wrapper(old_vtoc);
	}

	/* free recids array */
	kmem_free(recids, num_recs * sizeof (mddb_recid_t));

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
	    MD_UN2SET(new_un), MD_SID(new_un));

	/*
	 * release locks, return success
	 *
	 * Error paths also land here with rval set.  Parent locks are
	 * dropped in the reverse of the order they were taken.  The lock
	 * taken through lockp is not dropped here.
	 */
out:
	for (i =  npar - 1; (i >= 0); --i)
		md_ioctl_writerexit(&plock[i]);
	rw_exit(&md_unit_array_rw.lock);
	if (plock != NULL)
		kmem_free(plock, npar * sizeof (*plock));
	if (par != NULL)
		kmem_free(par, npar * sizeof (*par));
	return (rval);
}
/*
 * stripe_set:
 *	Create a new stripe metadevice from a user-supplied unit struct.
 *
 *	A database record of msp->size bytes is created and filled from
 *	msp->mdp, the in-core state is built with stripe_build_incore(),
 *	a reference is taken on the unit's hot spare pool (if any), any
 *	metadevice components get this unit as their parent, and the
 *	affected records are committed.
 *
 * PARAMETERS:	d    - md_set_params_t describing the request
 *		mode - passed through to ddi_copyin()
 *
 * RETURN:	0 on success; EFAULT if the copyin fails; the error from
 *		stripe_build_incore() if that fails; otherwise the value
 *		returned by the md error routine, with details left in
 *		msp->mde.
 */
static int
stripe_set(void *d, int mode)
{
	minor_t		mnum;
	ms_unit_t	*un;
	void		*p;
	mddb_recid_t	ms_recid;
	mddb_recid_t	*recids;
	mddb_type_t	typ1;
	int		err;
	set_t		setno;
	md_error_t	*mdep;
	struct ms_comp	*mdcomp;
	int		row;
	int		rid;
	int		num_recs;
	int		i, c;
	md_set_params_t	*msp = d;

	mnum = msp->mnum;
	setno = MD_MIN2SET(mnum);

	mdep = &msp->mde;

	mdclrerror(mdep);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
		return (mdmderror(mdep, MDE_INVAL_UNIT, mnum));
	}

	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(mdep, MDE_DB_STALE, mnum, setno));

	un = MD_UNIT(mnum);
	if (un != NULL) {
		return (mdmderror(mdep, MDE_UNIT_ALREADY_SETUP, mnum));
	}

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    stripe_md_ops.md_driver.md_drivername);

	/* create the db record for this mdstruct */
	if (msp->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		return (mdmderror(mdep, MDE_UNIT_TOO_LARGE, mnum));
#else
		ms_recid = mddb_createrec((size_t)msp->size, typ1, 0,
		    MD_CRO_64BIT | MD_CRO_STRIPE | MD_CRO_FN, setno);
#endif
	} else {
		ms_recid = mddb_createrec((size_t)msp->size, typ1, 0,
		    MD_CRO_32BIT | MD_CRO_STRIPE | MD_CRO_FN, setno);
	}
	/* a negative record id is a database status code */
	if (ms_recid < 0)
		return (mddbstatus2error(mdep, ms_recid, mnum, setno));

	/* get the address of the mdstruct */
	p = (void *) mddb_getrecaddr(ms_recid);
	/*
	 * It is okay that we muck with the mdstruct here,
	 * since no one else will know about the mdstruct
	 * until we commit it. If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet.
	 */

	/* copy in the user's mdstruct */
	err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, (caddr_t)p,
	    (size_t)msp->size, mode);
	if (err != 0) {
		mddb_deleterec_wrapper(ms_recid);
		return (EFAULT);
	}

	un = (ms_unit_t *)p;

	/* All 64 bit metadevices only support EFI labels. */
	if (msp->options & MD_CRO_64BIT) {
		un->c.un_flag |= MD_EFILABEL;
	}

	/*
	 * allocate the real recids array.  since we may have to commit
	 * underlying metadevice records, we need an array
	 * of size: total number of components in stripe + 3
	 * (1 for the stripe itself, one for the hotspare, one
	 * for the end marker).
	 */
	num_recs = 3;
	rid = 0;
	for (row = 0; row < un->un_nrows; row++) {
		struct ms_row *mdr = &un->un_row[row];
		num_recs += mdr->un_ncomp;
	}
	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
	recids[rid++] = ms_recid;

	MD_SID(un) = mnum;
	MD_RECID(un) = recids[0];
	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SUB_MIRROR | MD_CAN_SP;
	MD_PARENT(un) = MD_NO_PARENT;
	un->c.un_revision |= MD_FN_META_DEV;

	err = stripe_build_incore(p, 0);
	if (err != 0) {
		md_nblocks_set(mnum, -1ULL);
		MD_UNIT(mnum) = NULL;

		mddb_deleterec_wrapper(recids[0]);
		kmem_free(recids, num_recs * sizeof (mddb_recid_t));
		return (err);
	}

	/*
	 * Update unit availability
	 */
	md_set[setno].s_un_avail--;

	/* err is 0 here; it only changes if a pool reference is attempted */
	recids[rid] = 0;
	if (un->un_hsp_id != -1)
		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
		    &recids[rid++], NULL, NULL, NULL);

	if (err) {
		/*
		 * Record the error before the record is deleted: un points
		 * into the record obtained from mddb_getrecaddr(), so
		 * un->un_hsp_id must not be read after the delete below.
		 */
		err = mdhsperror(mdep, MDE_INVAL_HSP, un->un_hsp_id);

		/* the unit was not created after all; give the slot back */
		md_set[setno].s_un_avail++;

		md_nblocks_set(mnum, -1ULL);
		MD_UNIT(mnum) = NULL;

		mddb_deleterec_wrapper(recids[0]);
		kmem_free(recids, num_recs * sizeof (mddb_recid_t));
		return (err);
	}

	/*
	 * set the parent on any metadevice components.
	 * NOTE: currently soft partitions are the only metadevices
	 * which can appear within a stripe.
	 */
	mdcomp = (ms_comp_t *)((void *)&((char *)un)[un->un_ocomp]);
	for (row = 0; row < un->un_nrows; row++) {
		struct ms_row *mdr = &un->un_row[row];
		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
			ms_comp_t *mdc = &mdcomp[c++];
			md_dev64_t comp_dev;
			md_unit_t *comp_un;

			comp_dev = mdc->un_dev;
			if (md_getmajor(comp_dev) == md_major) {
				/* set parent and disallow soft partitioning */
				comp_un = MD_UNIT(md_getminor(comp_dev));
				recids[rid++] = MD_RECID(comp_un);
				md_set_parent(mdc->un_dev, MD_SID(un));
			}
		}
	}

	/* set end marker */
	recids[rid] = 0;
	mddb_commitrecs_wrapper(recids);

	md_create_unit_incore(mnum, &stripe_md_ops, 0);
	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));
	return (0);
}
/*
 * Example no. 7
 * 0
 */
/*
 * FUNCTION:	meta_repartition_drive()
 * INPUT:	sp	- the set name for the device to check
 *		dnp	- the name of the drive to partition
 *		options - options (see NOTES)
 * OUTPUT:	vtocp	- pointer to an mdvtoc_t structure in which
 *			  to return the new VTOC to the caller
 *		ep	- pointer to an md_error_t structure in which
 *			  to return errors to the caller
 * RETURNS:	int	-  0 - drive was or can be repartitioned
 *			  -1 - drive could not or should not be
 *			       repartitioned
 * PURPOSE:	Repartition a disk for use in a disk set or in order
 *		to create soft partitions on it.  Alternatively,
 *		return the VTOC that the disk would have if it were
 *		repartitioned without actually repartitioning it.
 *
 * NOTES:
 *
 *     This routine will repartition a drive to make it suitable for
 *     inclusion in a diskset.  Specifically, it will create a
 *     proposed VTOC that specifies a replica slice that begins at the
 *     first valid lba, is large enough to hold a label and a metadb
 *     replica, does not overlap any other slices, and is unmountable.
 *     If the current replica slice already satisfies those criteria,
 *     the routine will neither create a proposed VTOC nor repartition
 *     the drive unless the MD_REPART_FORCE flag is passed into the
 *     routine in the options argument.  If the routine does create a
 *     proposed VTOC, it will return the proposed VTOC in *vtocp if
 *     vtocp isn't NULL.
 *
 *     The slice to be used as the replica slice is determined by the
 *     function meta_replicaslice().
 *
 *     If the replica slice does not satisfy the above criteria or the
 *     MD_REPART_FORCE flag is set, the proposed VTOC will specify a
 *     replica slice that satisfies the above criteria, a slice zero
 *     that contains the remaining space on the disk, and no other
 *     slices.  If that repartitioning would cause the replica slice
 *     to move or shrink, and the MD_REPART_LEAVE_REP option is set,
 *     the routine will return -1 without creating or returning a
 *     proposed vtoc, and without repartitioning the disk.  Otherwise
 *     the routine will repartition the disk unless the
 *     MD_REPART_DONT_LABEL flag is set in the options argument.
 *
 *     If the MD_REPART_DONT_LABEL flag is set in the options argument,
 *     but the routine would otherwise repartition the drive, the
 *     routine won't repartition the drive, but will create a proposed
 *     VTOC that satisfies the criteria defined above and return it
 *     in *vtocp if vtocp isn't NULL.  The MD_REPART_DONT_LABEL
 *     option allows calling routines to determine what the contents of
 *     the drive's VTOC would be if the drive were repartitioned without
 *     actually repartitioning the drive.
 */
int
meta_repartition_drive(
	mdsetname_t	*sp,
	mddrivename_t	*dnp,
	int		options,
	mdvtoc_t	*vtocp,
	md_error_t	*ep
)
{
	uint_t			 replicaslice;
	diskaddr_t		 first_lba, last_lba;
	int			 round_sizes = 1;
	unsigned long long	 cylsize;
	unsigned long long	 drvsize;
	int			 i;
	mdgeom_t		*mdgp;
	mdvtoc_t		*mdvp;
	mdvtoc_t		 proposed_vtoc;
	uint_t			 reservedcyl;
	ushort_t		 resflag;
	mdname_t		*resnp;
	unsigned long long	 ressize;
	md_set_desc		*sd;
	daddr_t			 dbsize;
	diskaddr_t		 replica_start;
	diskaddr_t		 replica_size;
	diskaddr_t		 replica_end;
	diskaddr_t		 data_start;
	diskaddr_t		 data_size;

	if (meta_replicaslice(dnp, &replicaslice, ep) != 0) {
		return (-1);
	}

	/*
	 * Don't round for EFI disks
	 * (a replica slice of MD_SLICE6 is what this code treats as
	 * the EFI case).
	 */
	if (replicaslice == MD_SLICE6)
		round_sizes = 0;

	/*
	 * We took as argument a drive name pointer, but we need a
	 * slice name pointer to retrieve vtoc information.  So get
	 * the name pointer for slice zero first, then use it to get
	 * the vtoc info for the disk.
	 */
	if ((resnp = metaslicename(dnp, MD_SLICE0, ep)) == NULL)
		return (-1);

	if ((mdvp = metagetvtoc(resnp, FALSE, NULL, ep)) == NULL)
		return (-1);

	/*
	 * Determine the metadb size.
	 * Non-local sets whose descriptor is a multi-node set use
	 * MD_MN_DBSIZE; everything else uses MD_DBSIZE.
	 */
	dbsize = MD_DBSIZE;
	if (!metaislocalset(sp)) {
		if ((sd = metaget_setdesc(sp, ep)) == NULL)
			return (-1);

		if (MD_MNSET_DESC(sd))
			dbsize = MD_MN_DBSIZE;
	}

	/* If we've got an efi disk, we better have lba info */
	first_lba = mdvp->first_lba;
	last_lba = mdvp->last_lba;
	ASSERT((round_sizes != 0) || (last_lba > 0));

	/*
	 * At this point, ressize is used as a minimum value.  Later
	 * it will be rounded up to a cylinder boundary if
	 * appropriate.  ressize is in units of disk sectors.
	 */
	ressize = dbsize + VTOC_SIZE;
	resflag = V_UNMNT;

	/*
	 * If we're forcing the repartition, we can skip the replica
	 * slice and overlap tests.
	 */
	if (options & MD_REPART_FORCE) {
		goto do_repartition;
	}

	/*
	 * Replica slice tests: it must begin at first_lba, be long
	 * enough, have the right flags, and not overlap any other
	 * slices.  If any of these conditions is violated, we need to
	 * repartition the disk.
	 */
	if (mdvp->parts[replicaslice].start != first_lba) {
		goto do_repartition;
	}

	if (mdvp->parts[replicaslice].size < ressize) {
		goto do_repartition;
	}

	if (mdvp->parts[replicaslice].flag != resflag) {
		goto do_repartition;
	}

	/*
	 * Check for overlap: this test should use the actual size of
	 * the replica slice, as contained in the vtoc, and NOT the
	 * minimum size calculated above.
	 *
	 * The replica slice is known to start at first_lba here, so any
	 * other non-empty slice that starts before replica_end overlaps.
	 */
	replica_end = first_lba + mdvp->parts[replicaslice].size;
	for (i = 0; i < mdvp->nparts; i++) {
		if (i != replicaslice) {
			if ((mdvp->parts[i].size > 0) &&
			    (mdvp->parts[i].start < replica_end)) {
				goto do_repartition;
			}
		}
	}

	/*
	 * If we passed the above tests, then the disk is already
	 * partitioned appropriately, and we're not being told to
	 * force a change.
	 */
	return (0);

do_repartition:

	/*
	 * Retrieve disk geometry info and round to cylinder sizes
	 *
	 * mdgp and reservedcyl are only set on this (rounding) path;
	 * later uses are guarded by the same round_sizes test.
	 */
	if (round_sizes != 0) {

		if ((mdgp = metagetgeom(resnp, ep)) == NULL)
			return (-1);

		/*
		 * Both cylsize and drvsize are in units of disk
		 * sectors.
		 *
		 * The intended results are of type unsigned long
		 * long.  Since each operand of the first
		 * multiplication is of type unsigned int, we risk
		 * overflow by multiplying and then converting the
		 * result.  Therefore we explicitly cast (at least)
		 * one of the operands, forcing conversion BEFORE
		 * multiplication, and avoiding overflow.  The second
		 * assignment is OK, since one of the operands is
		 * already of the desired type.
		 */
		cylsize =
		    ((unsigned long long)mdgp->nhead) * mdgp->nsect;
		drvsize = cylsize * mdgp->ncyl;

		/*
		 * How many cylinders must we reserve for the replica
		 * slice to ensure that it meets the previously
		 * calculated minimum size?
		 *
		 * NOTE(review): cylsize is used as a divisor without a
		 * zero check; presumably the geometry is never zero --
		 * confirm.
		 */
		reservedcyl = (ressize + cylsize - 1) / cylsize;
		ressize = reservedcyl * cylsize;
	} else {
		/* no rounding: the drive size is the span between the LBAs */
		drvsize = last_lba - first_lba;
	}

	/* Would this require a forbidden change? */
	if (options & MD_REPART_LEAVE_REP) {
		if ((mdvp->parts[replicaslice].start != first_lba) ||
		    (mdvp->parts[replicaslice].size < ressize)) {
			return (mddeverror(ep, MDE_REPART_REPLICA,
			    resnp->dev, NULL));
		}
	}

	/*
	 * It seems unlikely that someone would pass us too small a
	 * disk, but it's still worth checking for...
	 *
	 * Each half of the test only reads the values that were set on
	 * the matching round_sizes path above.
	 */
	if (((round_sizes != 0) && (reservedcyl >= (int)mdgp->ncyl)) ||
	    ((round_sizes == 0) && (ressize + first_lba >= last_lba))) {
		return (mdmddberror(ep, MDE_DB_TOOSMALL,
		    meta_getminor(resnp->dev), sp->setno, 0, NULL));
	}

	/* the replica goes first; the data slice takes what follows it */
	replica_start = first_lba;
	replica_size = ressize;
	data_start = first_lba + ressize;
	data_size = drvsize - ressize;

	/*
	 * Create the proposed VTOC.  First copy the current VTOC
	 * into the proposed VTOC to duplicate the values that don't
	 * need to change.  Then change the partition table and set
	 * the flag value for the replica slice to resflag to reserve it
	 * for metadata.
	 */
	proposed_vtoc = *mdvp;
	/* We need at least replicaslice partitions in the proposed vtoc */
	if (replicaslice >= proposed_vtoc.nparts) {
		proposed_vtoc.nparts = replicaslice + 1;
	}
	/*
	 * Clear every partition entry except one tagged V_RESERVED; when
	 * such an entry is found, the data slice is cut back so that it
	 * ends where the reserved partition starts.
	 */
	for (i = 0; i < proposed_vtoc.nparts; i++) {
		/* don't change the reserved partition of an EFI device */
		if (proposed_vtoc.parts[i].tag == V_RESERVED)
			data_size = proposed_vtoc.parts[i].start - data_start;
		else
			(void) memset(&proposed_vtoc.parts[i], '\0',
				sizeof (proposed_vtoc.parts[i]));
	}

	/* slice zero holds the data; the replica slice holds the metadb */
	proposed_vtoc.parts[MD_SLICE0].start = data_start;
	proposed_vtoc.parts[MD_SLICE0].size = data_size;
	proposed_vtoc.parts[MD_SLICE0].tag = V_USR;
	proposed_vtoc.parts[replicaslice].start = replica_start;
	proposed_vtoc.parts[replicaslice].size = replica_size;
	proposed_vtoc.parts[replicaslice].flag = resflag;
	proposed_vtoc.parts[replicaslice].tag = V_USR;

	if (!(options & MD_REPART_DONT_LABEL)) {
		/*
		 * Label the disk with the proposed VTOC.
		 * The proposal is first stored through mdvp, then
		 * metasetvtoc() is called on the slice name.
		 */
		*mdvp = proposed_vtoc;
		if (metasetvtoc(resnp, ep) != 0) {
			return (-1);
		}
	}

	if (vtocp != NULL) {
		/*
		 * Return the proposed VTOC.
		 */
		*vtocp = proposed_vtoc;
	}

	return (0);
}