Example No. 1
void
moveobject(struct vinum_ioctl_msg *msg)
{
    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
    struct drive *drive;
    struct sd *sd;

    /* Check that our objects are valid (i.e. they exist) */
    drive = validdrive(msg->index, (struct _ioctl_reply *) msg);
    if (drive == NULL)
	return;
    sd = validsd(msg->otherobject, (struct _ioctl_reply *) msg);
    if (sd == NULL)
	return;
    if (sd->driveno == msg->index)			    /* sd already belongs to drive */
	return;

    if (sd->state > sd_stale)
	set_sd_state(sd->sdno, sd_stale, setstate_force);   /* make the subdisk stale */
    else
	sd->state = sd_empty;
    if (sd->plexno >= 0)				    /* part of a plex, */
	update_plex_state(sd->plexno);			    /* update its state */

    /* Return the space on the old drive */
    if ((sd->driveno >= 0)				    /* we have a drive, */
    &&(sd->sectors > 0))				    /* and some space on it */
	return_drive_space(sd->driveno,			    /* return the space */
	    sd->driveoffset,
	    sd->sectors);

    /* Reassign the old subdisk */
    sd->driveno = msg->index;
    sd->driveoffset = -1;				    /* let the drive decide where to put us */
    give_sd_to_drive(sd->sdno);
    reply->error = 0;
}
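Since the reply overlays the request message, a caller inspects the same structure it passed in. Below is a minimal, hypothetical caller sketch: the structure fields match the listing above, but the function name, the object numbers, and the error reporting are illustrative assumptions, not Vinum source.

/*
 * Hypothetical sketch: ask moveobject() to move subdisk 3
 * onto drive 1.  The reply overlays the message, so the
 * error comes back through the same memory.
 */
void
move_example(void)
{
    struct vinum_ioctl_msg msg;
    struct _ioctl_reply *reply = (struct _ioctl_reply *) &msg;

    bzero(&msg, sizeof(msg));
    msg.index = 1;					    /* destination drive number */
    msg.otherobject = 3;				    /* subdisk to move */
    moveobject(&msg);
    if (reply->error != 0)				    /* did the move fail? */
	log(LOG_ERR, "move failed: %d\n", reply->error);
}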
Example No. 2
/*
 * define the low-level requests needed to perform
 * a high-level I/O operation for a specific plex
 * 'plexno'.
 *
 * Return 0 if all subdisks involved in the
 * request are up, 1 if some subdisks are not up,
 * and -1 if the request is at least partially
 * outside the bounds of the subdisks.
 *
 * Modify the pointer *diskaddr to point to the
 * end address.  On read, return on the first bad
 * subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the prq structures
 * are not assigned.  The assignment is performed
 * by expandrq().  Strictly speaking, the elements
 * rqe->sdno of all entries should be set to -1,
 * since 0 (from bzero) is a valid subdisk number.
 * We avoid this problem by initializing the ones
 * we use, and not looking at the others (index >=
 * prq->requests).
 */
enum requeststatus
bre5(struct request *rq,
    int plexno,
    daddr_t * diskaddr,
    daddr_t diskend)
{
    struct metrics m;					    /* most of the information */
    struct sd *sd;
    struct plex *plex;
    struct buf *bp;					    /* user's bp */
    struct rqgroup *rqg;				    /* the request group that we will create */
    struct rqelement *rqe;				    /* point to this request information */
    int rsectors;					    /* sectors remaining in this stripe */
    int mysdno;						    /* another sd index in loops */
    int rqno;						    /* request number */

    rqg = NULL;						    /* shut up, damn compiler */
    m.diskstart = *diskaddr;				    /* start of transfer */
    bp = rq->bp;					    /* buffer pointer */
    plex = &PLEX[plexno];				    /* point to the plex */


    while (*diskaddr < diskend) {			    /* until we get it all sorted out */
	if (*diskaddr >= plex->length)			    /* beyond the end of the plex */
	    return REQUEST_EOF;				    /* can't continue */

	m.badsdno = -1;					    /* no bad subdisk yet */

	/* Part A: Define the request */
	/*
	 * First, calculate some sizes:
	 * The offset of the start address from
	 * the start of the stripe.
	 */
	m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));

	/*
	 * The plex-relative address of the
	 * start of the stripe.
	 */
	m.stripebase = *diskaddr - m.stripeoffset;

	/* subdisk containing the parity stripe */
	if (plex->organization == plex_raid5)
	    m.psdno = plex->subdisks - 1
		- (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
		% plex->subdisks;
	else						    /* RAID-4 */
	    m.psdno = plex->subdisks - 1;

	/*
	 * The number of the subdisk in which
	 * the start is located.
	 */
	m.firstsdno = m.stripeoffset / plex->stripesize;
	if (m.firstsdno >= m.psdno)			    /* at or past parity sd */
	    m.firstsdno++;				    /* increment it */

	/*
	 * The offset from the beginning of
	 * the stripe on this subdisk.
	 */
	m.initoffset = m.stripeoffset % plex->stripesize;

	/* The offset of the stripe start relative to this subdisk */
	m.sdbase = m.stripebase / (plex->subdisks - 1);

	m.useroffset = *diskaddr - m.diskstart;		    /* The offset of the start in the user buffer */

	/*
	 * The number of sectors to transfer in the
	 * current (first) subdisk.
	 */
	m.initlen = min(diskend - *diskaddr,		    /* the amount remaining to transfer */
	    plex->stripesize - m.initoffset);		    /* and the amount left in this block */

	/*
	 * The number of sectors to transfer in this stripe
	 * is the minimum of the amount remaining to transfer
	 * and the amount left in this stripe.
	 */
	m.stripesectors = min(diskend - *diskaddr,
	    plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);

	/* The number of data subdisks involved in this request */
	m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;

	/*
	 * Part B: decide what kind of transfer this will be,
	 * and determine the start and end addresses of the
	 * transfer in the current block.
	 *
	 * There are a number of different kinds of
	 * transfer, each of which relates to a
	 * specific subdisk:
	 *
	 * 1. Normal read.  All participating subdisks
	 *    are up, and the transfer can be made
	 *    directly to the user buffer.  The bounds
	 *    of the transfer are described by
	 *    m.dataoffset and m.datalen.  We have
	 *    already calculated m.initoffset and
	 *    m.initlen, which define the parameters
	 *    for the first data block.
	 *
	 * 2. Recovery read.  One participating
	 *    subdisk is down.  To recover data, all
	 *    the other subdisks, including the parity
	 *    subdisk, must be read.  The data is
	 *    recovered by exclusive-oring all the
	 *    other blocks.  The bounds of the
	 *    transfer are described by m.groupoffset
	 *    and m.grouplen.
	 *
	 * 3. A read request may request reading both
	 *    available data (normal read) and
	 *    non-available data (recovery read).
	 *    This can be a problem if the address
	 *    ranges of the two reads do not coincide:
	 *    in this case, the normal read needs to
	 *    be extended to cover the address range
	 *    of the recovery read, and must thus be
	 *    performed out of malloced memory.
	 *
	 * 4. Normal write.  All the participating
	 *    subdisks are up.  The bounds of the
	 *    transfer are described by m.dataoffset
	 *    and m.datalen.  Since these values
	 *    differ for each block, we calculate the
	 *    bounds for the parity block
	 *    independently as the maximum of the
	 *    individual blocks and store these values
	 *    in m.writeoffset and m.writelen.  This
	 *    write proceeds in four phases:
	 *
	 *    i.  Read the old contents of each block
	 *        and the parity block.
	 *    ii.  ``Remove'' the old contents from
	 *         the parity block with exclusive or.
	 *    iii. ``Insert'' the new contents of the
	 *          block in the parity block, again
	 *          with exclusive or.
	 *
	 *    iv.  Write the new contents of the data
	 *         blocks and the parity block.  The data
	 *         block transfers can be made directly from
	 *         the user buffer.
	 *
	 * 5. Degraded write where the data block is
	 *    not available.  The bounds of the
	 *    transfer are described by m.groupoffset
	 *    and m.grouplen. This requires the
	 *    following steps:
	 *
	 *    i.  Read in all the other data blocks,
	 *        excluding the parity block.
	 *
	 *    ii.  Recreate the parity block from the
	 *         other data blocks and the data to be
	 *         written.
	 *
	 *    iii. Write the parity block.
	 *
	 * 6. Parityless write, a write where the
	 *    parity block is not available.  This is
	 *    in fact the simplest: just write the
	 *    data blocks.  This can proceed directly
	 *    from the user buffer.  The bounds of the
	 *    transfer are described by m.dataoffset
	 *    and m.datalen.
	 *
	 * 7. Combination of degraded data block write
	 *    and normal write.  In this case the
	 *    address ranges of the reads may also
	 *    need to be extended to cover all
	 *    participating blocks.
	 *
	 * All requests in a group transfer cover
	 * the same address range relative to their
	 * subdisk.  The individual transfers may
	 * vary, but since our group of requests is
	 * all in a single slice, we can define a
	 * range in which they all fall.
	 *
	 * In the following code section, we determine
	 * which kind of transfer we will perform.  If
	 * there is a group transfer, we also decide
	 * its bounds relative to the subdisks.  At
	 * the end, we have the following values:
	 *
	 *  m.flags indicates the kinds of transfers
	 *    we will perform.
	 *  m.initoffset indicates the offset of the
	 *    beginning of any data operation relative
	 *    to the beginning of the stripe base.
	 *  m.initlen specifies the length of any data
	 *    operation.
	 *  m.dataoffset contains the same value as
	 *    m.initoffset.
	 *  m.datalen contains the same value as
	 *    m.initlen.  Initially dataoffset and
	 *    datalen describe the parameters for the
	 *    first data block; while building the data
	 *    block requests, they are updated for each
	 *    block.
	 *  m.groupoffset indicates the offset of any
	 *    group operation relative to the beginning
	 *    of the stripe base.
	 *  m.grouplen specifies the length of any
	 *    group operation.
	 *  m.writeoffset indicates the offset of a
	 *    normal write relative to the beginning of
	 *    the stripe base.  This value differs from
	 *    m.dataoffset in that it applies to the
	 *    entire operation, and not just the first
	 *    block.
	 *  m.writelen specifies the total span of a
	 *    normal write operation.  writeoffset and
	 *    writelen are used to define the parity
	 *    block.
	 */
	m.groupoffset = 0;				    /* assume no group... */
	m.grouplen = 0;					    /* until we know we have one */
	m.writeoffset = m.initoffset;			    /* start offset of transfer */
	m.writelen = 0;					    /* nothing to write yet */
	m.flags = 0;					    /* no flags yet */
	rsectors = m.stripesectors;			    /* remaining sectors to examine */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	if (m.sdcount > 1) {
	    plex->multiblock++;				    /* more than one block for the request */
	    /*
	     * If we have two transfers that don't overlap,
	     * (one at the end of the first block, the other
	     * at the beginning of the second block),
	     * it's cheaper to split them.
	     */
	    if (rsectors < plex->stripesize) {
		m.sdcount = 1;				    /* just one subdisk */
		m.stripesectors = m.initlen;		    /* and just this many sectors */
		rsectors = m.initlen;			    /* and in the loop counter */
	    }
	}
	if (SD[plex->sdnos[m.psdno]].state < sd_reborn)	    /* is our parity subdisk down? */
	    m.badsdno = m.psdno;			    /* note that it's down */
	if (bp->b_flags & B_READ) {			    /* read operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		if (mysdno == m.psdno)			    /* ignore parity on read */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
		    if (m.badsdno >= 0)			    /* we had one already, */
			return REQUEST_DOWN;		    /* we can't take a second */
		    m.badsdno = mysdno;			    /* got the first */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		    m.flags |= XFR_RECOVERY_READ;	    /* we need recovery */
		    plex->recovered_reads++;		    /* count another one */
		} else
		    m.flags |= XFR_NORMAL_READ;		    /* normal read */

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	} else {					    /* write operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		if (mysdno == m.psdno)			    /* parity stripe, we've dealt with that */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		sd = &SD[plex->sdnos[mysdno]];
		if (sd->state != sd_up) {
		    enum requeststatus s;

		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
		    if (s && (m.badsdno >= 0)) {	    /* second bad disk, */
			int sdno;
			/*
			 * If the parity disk is down, there's
			 * no recovery.  We make all involved
			 * subdisks stale.  Otherwise, we
			 * should be able to recover, but it's
			 * like pulling teeth.  Fix it later.
			 */
			for (sdno = 0; sdno < m.sdcount; sdno++) {
			    struct sd *sd = &SD[plex->sdnos[sdno]];
			    if (sd->state >= sd_reborn)	    /* sort of up, */
				set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
			}
			return s;			    /* and crap out */
		    }
		    m.badsdno = mysdno;			    /* note which one is bad */
		    m.flags |= XFR_DEGRADED_WRITE;	    /* we need recovery */
		    plex->degraded_writes++;		    /* count another one */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		} else {
		    m.flags |= XFR_NORMAL_WRITE;	    /* normal write operation */
		    if (m.writeoffset > m.dataoffset) {	    /* move write operation lower */
			m.writelen = max(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.dataoffset;
			m.writeoffset = m.dataoffset;
		    } else
			m.writelen = max(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.writeoffset;
		}

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	    if (m.badsdno == m.psdno) {			    /* got a bad parity block, */
		struct sd *psd = &SD[plex->sdnos[m.psdno]];

		if (psd->state == sd_down)
		    set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
		else if (psd->state == sd_crashed)
		    set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
		m.flags &= ~XFR_NORMAL_WRITE;		    /* this write isn't normal, */
		m.flags |= XFR_PARITYLESS_WRITE;	    /* it's parityless */
		plex->parityless_writes++;		    /* count another one */
	    }
	}

	/* reset the initial transfer values */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	/* decide how many requests we need */
	if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
	    /* doing a recovery read or degraded write, */
	    m.rqcount = plex->subdisks;			    /* all subdisks */
	else if (m.flags & XFR_NORMAL_WRITE)		    /* normal write, */
	    m.rqcount = m.sdcount + 1;			    /* all data blocks and the parity block */
	else						    /* parityless write or normal read */
	    m.rqcount = m.sdcount;			    /* just the data blocks */

	/* Part C: build the requests */
	rqg = allocrqg(rq, m.rqcount);			    /* get a request group */
	if (rqg == NULL) {				    /* malloc failed */
	    bp->b_flags |= B_ERROR;
	    bp->b_error = ENOMEM;
	    biodone(bp);
	    return REQUEST_ENOMEM;
	}
	rqg->plexno = plexno;
	rqg->flags = m.flags;
	rqno = 0;					    /* index in the request group */

	/* 1: PARITY BLOCK */
	/*
	 * Are we performing an operation which requires parity?  In that case,
	 * work out the parameters and define the parity block.
	 * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
	 */
	if (m.flags & XFR_PARITYOP) {			    /* need parity */
	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[m.psdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point back to group */
	    rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
	    &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE);	    /* transfer flags without data op stuff */
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    rqe->b.b_flags |= B_READ;			    /* we must read first */
	    m.sdcount++;				    /* adjust the subdisk count */
	    rqno++;					    /* and point to the next request */
	}
	/*
	 * 2: DATA BLOCKS
	 * Now build up requests for the blocks required
	 * for individual transfers
	 */
	for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */
	    if (mysdno == plex->subdisks)		    /* got to the end, */
		mysdno = 0;				    /* wrap around */
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */

	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point to group */
	    if (m.flags & XFR_NEEDS_MALLOC)		    /* we need a malloced buffer first */
		rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
	    else
		rqe->flags = m.flags | XFR_DATA_BLOCK;	    /* transfer flags */
	    if (mysdno == m.badsdno) {			    /* this is the bad subdisk */
		rqg->badsdno = rqno;			    /* note which one */
		rqe->flags |= XFR_BAD_SUBDISK;		    /* note that it's dead */
		/*
		 * we can't read or write from/to it,
		 * but we don't need to malloc
		 */
		rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
	    }
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->useroffset = m.useroffset;		    /* offset in user buffer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    if ((m.flags & XFR_PARITYOP)		    /* parity operation, */
	    &&((rqe->flags & XFR_BAD_SUBDISK) == 0))	    /* and not the bad subdisk, */
		rqe->b.b_flags |= B_READ;		    /* we must read first */

	    /* Now update pointers for the next block */
	    *diskaddr += m.datalen;			    /* skip past what we've done */
	    m.stripesectors -= m.datalen;		    /* deduct from what's left */
	    m.useroffset += m.datalen;			    /* and move on in the user buffer */
	    m.datalen = min(m.stripesectors, plex->stripesize);	/* and recalculate */
	    m.dataoffset = 0;				    /* start at the beginning of next block */
	}

	/*
	 * 3: REMAINING BLOCKS FOR RECOVERY
	 * Finally, if we have a recovery operation, build
	 * up transfers for the other subdisks.  Follow the
	 * subdisks around until we get to where we started.
	 * These requests use only the group parameters.
	 */
	if ((rqno < m.rqcount)				    /* haven't done them all already */
	&&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
	    for (; rqno < m.rqcount; rqno++, mysdno++) {
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */
		if (mysdno == plex->subdisks)		    /* got to the end, */
		    mysdno = 0;				    /* wrap around */
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		rqe = &rqg->rqe[rqno];			    /* point to element */
		sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
		rqe->rqg = rqg;				    /* point to group */

		rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
		rqe->dataoffset = 0;			    /* for tidiness' sake */
		rqe->groupoffset = 0;			    /* group starts at the beginning */
		rqe->datalen = 0;
		rqe->grouplen = m.grouplen;
		rqe->buflen = m.grouplen;
		rqe->flags = (m.flags | XFR_MALLOCED)	    /* transfer flags without data op stuff */
		&~XFR_DATAOP;
		rqe->sdno = sd->sdno;			    /* subdisk number */
		rqe->driveno = sd->driveno;
		if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		    return REQUEST_ENOMEM;		    /* can't do it */
		rqe->b.b_flags |= B_READ;		    /* we must read first */
	    }
	}
	/*
	 * We need to lock the address range before
	 * doing anything.  We don't have to be
	 * performing a recovery operation: somebody
	 * else could be doing so, and the results could
	 * influence us.  Note the fact here, we'll perform
	 * the lock in launch_requests.
	 */
	rqg->lockbase = m.stripebase;
	if (*diskaddr < diskend)			    /* didn't finish the request on this stripe */
	    plex->multistripe++;			    /* count another one */
    }
    return REQUEST_OK;
}
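To make the Part A stripe geometry concrete, here is a standalone worked example of the same arithmetic with made-up numbers (4 subdisks, 128-sector stripes, plex address 1000). It is plain userland C and assumes nothing beyond the formulas visible in bre5() above.

#include <stdio.h>

int
main(void)
{
    long diskaddr = 1000;				    /* plex-relative sector */
    long stripesize = 128;				    /* sectors per stripe */
    long subdisks = 4;					    /* subdisks, including parity */
    long datawidth = stripesize * (subdisks - 1);	    /* data sectors per stripe */

    long stripeoffset = diskaddr % datawidth;		    /* offset from the stripe start */
    long stripebase = diskaddr - stripeoffset;		    /* plex address of the stripe start */
    long psdno = subdisks - 1				    /* rotating RAID-5 parity subdisk */
	- (diskaddr / datawidth) % subdisks;
    long firstsdno = stripeoffset / stripesize;		    /* data subdisk holding the start */
    if (firstsdno >= psdno)				    /* at or past the parity subdisk, */
	firstsdno++;					    /* skip over it */

    /* prints: stripebase 768, parity sd 1, first data sd 2, offset 104 */
    printf("stripebase %ld, parity sd %ld, first data sd %ld, offset %ld\n",
	stripebase, psdno, firstsdno, stripeoffset % stripesize);
    return 0;
}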
Example No. 3
/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The buf attached to the bio parameter is in fact a
 * struct rqelement, which includes a couple of extras
 * at the end.
 */
void
complete_rqe(struct bio *bio)
{
    union daemoninfo di;
    struct buf *bp = bio->bio_buf;
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct bio *ubio;					    /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;					    /* for error messages */

    get_mplock();

    rqe = (struct rqelement *) bp;			    /* point to the element that completed */
    rqg = rqe->rqg;					    /* and the request group */
    rq = rqg->rq;					    /* and the complete request */
    ubio = rq->bio;					    /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;					    /* one less outstanding I/O on this drive */
    vinum_conf.active--;				    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))		    /* or the global limit */
	wakeup(&launch_requests);			    /* let another one at it */
    if ((bp->b_flags & B_ERROR) != 0) {			    /* transfer in error */
	gravity = "";
	sd = &SD[rqe->sdno];

	if (bp->b_error != 0)				    /* did it return a number? */
	    rq->error = bp->b_error;			    /* yes, put it in. */
	else if (rq->error == 0)			    /* no: do we have one already? */
	    rq->error = EIO;				    /* no: catchall "I/O error" */
	sd->lasterror = rq->error;
	if (bp->b_cmd == BUF_CMD_READ) {
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = "fatal ";
		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
	    }
	    log(LOG_ERR,
		"%s: %sread error, offset %lld for %d bytes\n",
		sd->name,
		gravity,
		(long long)bio->bio_offset,
		bp->b_bcount);
	} else {					    /* write operation */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = "fatal ";
		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
	    }
	    log(LOG_ERR,
		"%s: %swrite error, offset %lld for %d bytes\n",
		sd->name,
		gravity,
		(long long)bio->bio_offset,
		bp->b_bcount);
	}
	log(LOG_ERR,
	    "%s: user buffer offset %lld for %d bytes\n",
	    sd->name,
	    (long long)ubio->bio_offset,
	    ubio->bio_buf->b_bcount);
	if (rq->error == ENXIO) {			    /* the drive's down too */
	    log(LOG_ERR,
		"%s: fatal drive I/O error, offset %lld for %d bytes\n",
		DRIVE[rqe->driveno].label.name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	    DRIVE[rqe->driveno].lasterror = rq->error;
	    set_drive_state(rqe->driveno,		    /* take the drive down */
		drive_down,
		setstate_force);
	}
    }
    /* Now update the statistics */
    if (bp->b_cmd == BUF_CMD_READ) { 				/* read operation */
	DRIVE[rqe->driveno].reads++;
	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
	SD[rqe->sdno].reads++;
	SD[rqe->sdno].bytes_read += bp->b_bcount;
	PLEX[rqe->rqg->plexno].reads++;
	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
	}
    } else {						    /* write operation */
	DRIVE[rqe->driveno].writes++;
	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
	SD[rqe->sdno].writes++;
	SD[rqe->sdno].bytes_written += bp->b_bcount;
	PLEX[rqe->rqg->plexno].writes++;
	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
	}
    }
    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
	int *sdata;					    /* source */
	int *data;					    /* and group data */
	int length;					    /* and count involved */
	int count;					    /* loop counter */
	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

	/* XOR destination is the user data */
	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT];	/* old data contents */
	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

	for (count = 0; count < length; count++)
	    data[count] ^= sdata[count];

	/*
	 * In a normal read, we will normally read directly
	 * into the user buffer.  This doesn't work if
	 * we're also doing a recovery, so we have to
	 * copy it
	 */
	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
	    char *dst;

	    dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
	    bcopy(src, dst, length);			    /* move it */
	}
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
    &&(rqg->active == 1))				    /* and this is the last active request */
	complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;					    /* this request now finished */
    if (rqg->active == 0) {				    /* request group finished, */
	rq->active--;					    /* one less */
	if (rqg->lock) {				    /* got a lock? */
	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
	    rqg->lock = 0;
	}
    }
    if (rq->active == 0) {				    /* request finished, */
#ifdef VINUMDEBUG
	if (debug & DEBUG_RESID) {
	    if (ubio->bio_buf->b_resid != 0)			    /* still something to transfer? */
		Debugger("resid");
	}
#endif

	if (rq->error) {				    /* did we have an error? */
	    if (rq->isplex) {				    /* plex operation, */
		ubio->bio_buf->b_flags |= B_ERROR;	    /* yes, propagate to user */
		ubio->bio_buf->b_error = rq->error;
	    } else {					    /* try to recover */
		di.rq = rq;
		queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
	    }
	} else {
	    ubio->bio_buf->b_resid = 0;			    /* completed our transfer */
	    if (rq->isplex == 0)			    /* volume request, */
		VOL[rq->volplex.volno].active--;	    /* another request finished */
	    biodone(ubio);				    /* top level buffer completed */
	    freerq(rq);					    /* return the request storage */
	}
    }
    rel_mplock();
}
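complete_raid5_write() is not part of this listing. As a rough sketch of phases ii and iii of the normal write described in bre5() above, the parity update reduces to two exclusive-or passes over each block; the helper below illustrates that description with an assumed signature, and is not the actual Vinum routine. DEV_BSIZE is the sector size, as elsewhere in the listing.

/*
 * Sketch of a RAID-5 parity update per the phases in bre5():
 * new parity = old parity ^ old data ^ new data.
 */
static void
update_parity(int *parity, const int *olddata, const int *newdata, int sectors)
{
    int length = sectors * (DEV_BSIZE / sizeof(int));	    /* number of ints to process */
    int count;

    for (count = 0; count < length; count++) {
	parity[count] ^= olddata[count];		    /* ``remove'' the old contents */
	parity[count] ^= newdata[count];		    /* ``insert'' the new contents */
    }
}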
Example No. 4
/* attach an object to a superior object */
void
attachobject(struct vinum_ioctl_msg *msg)
{
    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
    int sdno;
    struct sd *sd;
    struct plex *plex;
    struct volume *vol;

    switch (msg->type) {
    case drive_object:					    /* you can't attach a drive to anything */
    case volume_object:					    /* nor a volume */
    case invalid_object:				    /* "this can't happen" */
	reply->error = EINVAL;
	reply->msg[0] = '\0';				    /* vinum(8) doesn't do this */
	return;

    case sd_object:
	sd = validsd(msg->index, reply);
	if (sd == NULL)					    /* not a valid subdisk  */
	    return;
	plex = validplex(msg->otherobject, reply);
	if (plex) {
	    /*
	     * We should be more intelligent about this.
	     * We should be able to reattach a dead
	     * subdisk, but if we want to increase the total
	     * number of subdisks, we have a lot of reshuffling
	     * to do. XXX
	     */
	    if ((plex->organization != plex_concat)	    /* can't attach to striped and RAID-4/5 */
	    &&(!msg->force)) {				    /* without using force */
		reply->error = EINVAL;			    /* and tell the user why */
		strcpy(reply->msg, "Can't attach to this plex organization");
		return;
	    }
	    if (sd->plexno >= 0) {			    /* already belong to a plex */
		reply->error = EBUSY;			    /* no message, the user should check */
		reply->msg[0] = '\0';
		return;
	    }
	    sd->plexoffset = msg->offset;		    /* this is where we want it */
	    set_sd_state(sd->sdno, sd_stale, setstate_force); /* make sure it's stale */
	    give_sd_to_plex(plex->plexno, sd->sdno);	    /* and give it to the plex */
	    update_sd_config(sd->sdno, 0);
	    save_config();
	}
	if (sd->state == sd_reviving)
	    reply->error = EAGAIN;			    /* need to revive it */
	else
	    reply->error = 0;
	break;

    case plex_object:
	plex = validplex(msg->index, reply);		    /* get plex */
	if (plex == NULL)
	    return;
	vol = validvol(msg->otherobject, reply);	    /* and volume information */
	if (vol) {
	    if ((vol->plexes == MAXPLEX)		    /* we have too many already */
	    ||(plex->volno >= 0)) {			    /* or the plex has an owner */
		reply->error = EINVAL;			    /* no message, the user should check */
		reply->msg[0] = '\0';
		return;
	    }
	    for (sdno = 0; sdno < plex->subdisks; sdno++) {
		sd = &SD[plex->sdnos[sdno]];

		if (sd->state > sd_down)		    /* real subdisk, vaguely accessible */
		    set_sd_state(plex->sdnos[sdno], sd_stale, setstate_force); /* make it stale */
	    }
	    set_plex_state(plex->plexno, plex_up, setstate_none); /* update plex state */
	    give_plex_to_volume(msg->otherobject, msg->index); /* and give it to the volume */
	    update_plex_config(plex->plexno, 0);
	    save_config();
	    reply->error = 0;				    /* all went well */
	}
    }
}
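As with moveobject(), the caller reads the overlaid reply. A hypothetical sketch of attaching a subdisk to a plex follows; the fields and the EAGAIN convention come from the listing above, while the function name and the object numbers are made up for illustration.

/*
 * Hypothetical sketch: attach subdisk 2 at offset 0 of plex 5.
 * Without force this fails for striped and RAID-4/5 plexes;
 * EAGAIN means the attach worked but the subdisk still needs
 * to be revived.
 */
void
attach_example(void)
{
    struct vinum_ioctl_msg msg;
    struct _ioctl_reply *reply = (struct _ioctl_reply *) &msg;

    bzero(&msg, sizeof(msg));
    msg.type = sd_object;				    /* attaching a subdisk */
    msg.index = 2;					    /* the subdisk */
    msg.otherobject = 5;				    /* the plex to receive it */
    msg.offset = 0;					    /* where in the plex */
    msg.force = 0;					    /* don't override the organization check */
    attachobject(&msg);
    if ((reply->error != 0) && (reply->error != EAGAIN))
	log(LOG_ERR, "attach failed: %d\n", reply->error);
}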
Example No. 5
/* Perform I/O on a subdisk */
void
sdio(struct buf *bp)
{
    int s;						    /* spl */
    struct sd *sd;
    struct sdbuf *sbp;
    daddr_t endoffset;
    struct drive *drive;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
    sd = &SD[Sdno(bp->b_dev)];				    /* point to the subdisk */
    drive = &DRIVE[sd->driveno];

    if (drive->state != drive_up) {
	if (sd->state >= sd_crashed) {
	    if (bp->b_flags & B_READ)			    /* reading, */
		set_sd_state(sd->sdno, sd_crashed, setstate_force);
	    else
		set_sd_state(sd->sdno, sd_stale, setstate_force);
	}
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /*
     * We allow access to any kind of subdisk as long as we can expect
     * to get the I/O performed.
     */
    if (sd->state < sd_empty) {				    /* nothing to talk to, */
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /* Get a buffer */
    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
    if (sbp == NULL) {
	bp->b_flags |= B_ERROR;
	bp->b_error = ENOMEM;
	biodone(bp);
	return;
    }
    bzero(sbp, sizeof(struct sdbuf));			    /* start with nothing */
    sbp->b.b_flags = bp->b_flags | B_CALL;		    /* inform us when it's done */
    sbp->b.b_bufsize = bp->b_bufsize;			    /* buffer size */
    sbp->b.b_bcount = bp->b_bcount;			    /* number of bytes to transfer */
    sbp->b.b_resid = bp->b_resid;			    /* and amount waiting */
    sbp->b.b_dev = DRIVE[sd->driveno].dev;		    /* device */
    sbp->b.b_data = bp->b_data;				    /* data buffer */
    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
    sbp->b.b_iodone = sdio_done;			    /* come here on completion */
    BUF_LOCKINIT(&sbp->b);				    /* get a lock for the buffer */
    BUF_LOCK(&sbp->b, LK_EXCLUSIVE);			    /* and lock it */
    sbp->bp = bp;					    /* note the address of the original header */
    sbp->sdno = sd->sdno;				    /* note for statistics */
    sbp->driveno = sd->driveno;
    endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;  /* final sector offset */
    if (endoffset > sd->sectors) {			    /* beyond the end */
	sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
	if (sbp->b.b_bcount <= 0) {			    /* nothing to transfer */
	    bp->b_resid = bp->b_bcount;			    /* nothing transferred */
	    biodone(bp);
	    Free(sbp);
	    return;
	}
    }
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
	    sbp->b.b_flags & B_READ ? "Read" : "Write",
	    major(sbp->b.b_dev),
	    minor(sbp->b.b_dev),
	    sbp->sdno,
	    (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
	    (int) sbp->b.b_blkno,
	    sbp->b.b_bcount);
#endif
    s = splbio();
#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
    BUF_STRATEGY(&sbp->b, 0);
    splx(s);
}
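The end-of-subdisk trim in sdio() is easy to verify with concrete numbers. The standalone sketch below repeats the arithmetic in userland with made-up values: a 20-sector request starting at sector 990 of a 1000-sector subdisk gets cut back to the 10 sectors that fit.

#include <stdio.h>

#define DEV_BSIZE 512					    /* bytes per sector */

int
main(void)
{
    long sectors = 1000;				    /* subdisk length in sectors */
    long blkno = 990;					    /* first sector of the request */
    long bcount = 20 * DEV_BSIZE;			    /* bytes requested */
    long endoffset = blkno + bcount / DEV_BSIZE;	    /* final sector offset */

    if (endoffset > sectors)				    /* beyond the end, */
	bcount -= (endoffset - sectors) * DEV_BSIZE;	    /* trim the excess */
    if (bcount <= 0)					    /* nothing left at all? */
	printf("nothing to transfer\n");
    else
	printf("transfer %ld sectors\n", bcount / DEV_BSIZE); /* prints: transfer 10 sectors */
    return 0;
}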