Example #1
File: vinumlock.c  Project: MarginC/kame
/* Unlock a volume and let the next one at it */
void
unlockrange(int plexno, struct rangelock *lock)
{
    struct plex *plex;

    plex = &PLEX[plexno];
#ifdef DIAGNOSTIC
    if (lock < &plex->lock[0] || lock >= &plex->lock[PLEX_LOCKS])
	panic("vinum: rangelock %p on plex %d invalid, not between %p and %p",
	    lock,
	    plexno,
	    &plex->lock[0],
	    &plex->lock[PLEX_LOCKS]);
#endif
#ifdef VINUMDEBUG
    if (debug & DEBUG_LOCKREQS) {
	struct rangelockinfo lockinfo;

	lockinfo.stripe = lock->stripe;
	lockinfo.bp = lock->bp;
	lockinfo.plexno = plex->plexno;
	logrq(loginfo_unlock, (union rqinfou) &lockinfo, lock->bp);   /* log the unlock, not a lock wait */
    }
#endif
    lock->stripe = 0;					    /* no longer used */
    plex->usedlocks--;					    /* one less lock */
    if (plex->usedlocks == PLEX_LOCKS - 1)		    /* we were full, */
	wakeup(&plex->usedlocks);			    /* get a waiter if one's there */
    wakeup((void *) lock);
}
/* I/O on subdisk completed */
void
sdio_done(struct bio *bio)
{
    struct sdbuf *sbp;

    get_mplock();

    sbp = (struct sdbuf *) bio->bio_buf;
    if (sbp->b.b_flags & B_ERROR) {			    /* had an error */
	sbp->bio->bio_buf->b_flags |= B_ERROR;			    /* propagate upwards */
	sbp->bio->bio_buf->b_error = sbp->b.b_error;
    }
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiodone, (union rqinfou)bio, bio);
#endif
    sbp->bio->bio_buf->b_resid = sbp->b.b_resid;			    /* copy the resid field */
    /* Now update the statistics */
    if (sbp->b.b_cmd == BUF_CMD_READ) {			    /* read operation */
	DRIVE[sbp->driveno].reads++;
	DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
	SD[sbp->sdno].reads++;
	SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
    } else {						    /* write operation */
	DRIVE[sbp->driveno].writes++;
	DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
	SD[sbp->sdno].writes++;
	SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
    }
    biodone_sync(bio);
    biodone(sbp->bio);					    /* complete the caller's I/O */
    BUF_UNLOCK(&sbp->b);
    uninitbufbio(&sbp->b);
    Free(sbp);
    rel_mplock();
}
Example #3
File: vinumlock.c  Project: MarginC/kame
/* Lock a stripe of a plex, wait if it's in use */
struct rangelock *
lockrange(daddr_t stripe, struct buf *bp, struct plex *plex)
{
    struct rangelock *lock;
    struct rangelock *pos;				    /* position of first free lock */
    int foundlocks;					    /* number of locks found */

    /*
     * We could get by without counting the number
     * of locks we find, but we have a linear search
     * through a table which in most cases will be
     * empty.  It's faster to stop when we've found
     * all the locks that are there.  This is also
     * the reason why we put pos at the beginning
     * instead of the end, though it requires an
     * extra test.
     */
    pos = NULL;
    foundlocks = 0;

    /*
     * we can't use 0 as a valid address, so
     * increment all addresses by 1.
     */
    stripe++;
    mtx_lock(&plex->lockmtx);

    /* Wait here if the table is full */
    while (plex->usedlocks == PLEX_LOCKS)		    /* all in use */
	msleep(&plex->usedlocks, &plex->lockmtx, PRIBIO, "vlock", 0);

#ifdef DIAGNOSTIC
    if (plex->usedlocks >= PLEX_LOCKS)
	panic("lockrange: Too many locks in use");
#endif

    lock = plex->lock;					    /* pointer in lock table */
    if (plex->usedlocks > 0)				    /* something locked, */
	/* Search the lock table for our stripe */
	for (; lock < &plex->lock[PLEX_LOCKS]
	    && foundlocks < plex->usedlocks;
	    lock++) {
	    if (lock->stripe) {				    /* in use */
		foundlocks++;				    /* found another one in use */
		if ((lock->stripe == stripe)		    /* it's our stripe */
		&&(lock->bp != bp)) {			    /* but not our request */
#ifdef VINUMDEBUG
		    if (debug & DEBUG_LOCKREQS) {
			struct rangelockinfo lockinfo;

			lockinfo.stripe = stripe;
			lockinfo.bp = bp;
			lockinfo.plexno = plex->plexno;
			logrq(loginfo_lockwait, (union rqinfou) &lockinfo, bp);
		    }
#endif
		    plex->lockwaits++;			    /* waited one more time */
		    msleep(lock, &plex->lockmtx, PRIBIO, "vrlock", 0);
		    lock = &plex->lock[-1];		    /* start again */
		    foundlocks = 0;
		    pos = NULL;
		}
	    } else if (pos == NULL)			    /* still looking for somewhere? */
		pos = lock;				    /* a place to put this one */
	}
    /*
     * This untidy looking code ensures that we'll
     * always end up pointing to the first free lock
     * entry, thus minimizing the number of
     * iterations necessary.
     */
    if (pos == NULL)					    /* didn't find one on the way, */
	pos = lock;					    /* use the one we're pointing to */

    /*
     * The address range is free, and we're pointing
     * to the first unused entry.  Make it ours.
     */
    pos->stripe = stripe;
    pos->bp = bp;
    plex->usedlocks++;					    /* one more lock */
    mtx_unlock(&plex->lockmtx);
#ifdef VINUMDEBUG
    if (debug & DEBUG_LOCKREQS) {
	struct rangelockinfo lockinfo;

	lockinfo.stripe = stripe;
	lockinfo.bp = bp;
	lockinfo.plexno = plex->plexno;
	logrq(loginfo_lock, (union rqinfou) &lockinfo, bp);
    }
#endif
    return pos;
}
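/*
 * A minimal, self-contained sketch (not part of the vinum source) of the
 * search strategy described in the comment above: walk a mostly-empty lock
 * table, stop as soon as every in-use entry has been seen, and remember the
 * first free slot on the way so no second pass is needed.  The struct, names
 * and table size here are illustrative stand-ins only.
 */
#include <stdio.h>

#define TABLE_SIZE 8

struct slot {
    long stripe;				    /* 0 means the slot is free */
};

/* Return the first free slot, or NULL if stripe is already held by another request. */
static struct slot *
find_slot(struct slot *table, int used, long stripe)
{
    struct slot *pos = NULL;			    /* first free slot seen so far */
    int foundlocks = 0;				    /* in-use entries seen so far */
    struct slot *s;

    for (s = table; s < &table[TABLE_SIZE] && foundlocks < used; s++) {
	if (s->stripe) {			    /* in use */
	    foundlocks++;
	    if (s->stripe == stripe)
		return NULL;			    /* the real code would sleep and retry here */
	} else if (pos == NULL)
	    pos = s;				    /* remember the first hole */
    }
    return pos ? pos : s;			    /* first entry past the in-use ones otherwise */
}

int
main(void)
{
    struct slot table[TABLE_SIZE] = { { 0 }, { 5 }, { 0 }, { 9 } };
    struct slot *pos = find_slot(table, 2, 7);

    printf("stripe 7 goes in slot %ld\n", (long) (pos - table)); /* prints slot 0 */
    return 0;
}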
/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
void
complete_rqe(struct bio *bio)
{
    union daemoninfo di;
    struct buf *bp = bio->bio_buf;
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct bio *ubio;					    /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;					    /* for error messages */

    get_mplock();

    rqe = (struct rqelement *) bp;			    /* point to the element that completed */
    rqg = rqe->rqg;					    /* and the request group */
    rq = rqg->rq;					    /* and the complete request */
    ubio = rq->bio;					    /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;					    /* one less outstanding I/O on this drive */
    vinum_conf.active--;				    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))		    /* or the global limit */
	wakeup(&launch_requests);			    /* let another one at it */
    if ((bp->b_flags & B_ERROR) != 0) {			    /* transfer in error */
	gravity = "";
	sd = &SD[rqe->sdno];

	if (bp->b_error != 0)				    /* did it return a number? */
	    rq->error = bp->b_error;			    /* yes, put it in. */
	else if (rq->error == 0)			    /* no: do we have one already? */
	    rq->error = EIO;				    /* no: catchall "I/O error" */
	sd->lasterror = rq->error;
	if (bp->b_cmd == BUF_CMD_READ) {
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = " fatal";
		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
	    }
	    log(LOG_ERR,
		"%s:%s read error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	} else {					    /* write operation */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = "fatal ";
		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
	    }
	    log(LOG_ERR,
		"%s:%s write error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	}
	log(LOG_ERR,
	    "%s: user buffer offset %lld for %d bytes\n",
	    sd->name,
	    (long long)ubio->bio_offset,
	    ubio->bio_buf->b_bcount);
	if (rq->error == ENXIO) {			    /* the drive's down too */
	    log(LOG_ERR,
		"%s: fatal drive I/O error, offset %lld for %d bytes\n",
		DRIVE[rqe->driveno].label.name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	    DRIVE[rqe->driveno].lasterror = rq->error;
	    set_drive_state(rqe->driveno,		    /* take the drive down */
		drive_down,
		setstate_force);
	}
    }
    /* Now update the statistics */
    if (bp->b_cmd == BUF_CMD_READ) { 				/* read operation */
	DRIVE[rqe->driveno].reads++;
	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
	SD[rqe->sdno].reads++;
	SD[rqe->sdno].bytes_read += bp->b_bcount;
	PLEX[rqe->rqg->plexno].reads++;
	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
	}
    } else {						    /* write operation */
	DRIVE[rqe->driveno].writes++;
	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
	SD[rqe->sdno].writes++;
	SD[rqe->sdno].bytes_written += bp->b_bcount;
	PLEX[rqe->rqg->plexno].writes++;
	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
	}
    }
    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
	int *sdata;					    /* source */
	int *data;					    /* and group data */
	int length;					    /* and count involved */
	int count;					    /* loop counter */
	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

	/* XOR destination is the user data */
	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT];	/* old data contents */
	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

	for (count = 0; count < length; count++)
	    data[count] ^= sdata[count];

	/*
	 * In a normal read, we will normally read directly
	 * into the user buffer.  This doesn't work if
	 * we're also doing a recovery, so we have to
	 * copy it
	 */
	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
	    char *dst;

	    dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
	    bcopy(src, dst, length);			    /* move it */
	}
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
    &&(rqg->active == 1))				    /* and this is the last active request */
	complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;					    /* this request now finished */
    if (rqg->active == 0) {				    /* request group finished, */
	rq->active--;					    /* one less */
	if (rqg->lock) {				    /* got a lock? */
	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
	    rqg->lock = 0;
	}
    }
    if (rq->active == 0) {				    /* request finished, */
#ifdef VINUMDEBUG
	if (debug & DEBUG_RESID) {
	    if (ubio->bio_buf->b_resid != 0)			    /* still something to transfer? */
		Debugger("resid");
	}
#endif

	if (rq->error) {				    /* did we have an error? */
	    if (rq->isplex) {				    /* plex operation, */
		ubio->bio_buf->b_flags |= B_ERROR;	    /* yes, propagate to user */
		ubio->bio_buf->b_error = rq->error;
	    } else {					    /* try to recover */
		di.rq = rq;
		queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
	    }
	} else {
	    ubio->bio_buf->b_resid = 0;			    /* completed our transfer */
	    if (rq->isplex == 0)			    /* volume request, */
		VOL[rq->volplex.volno].active--;	    /* another request finished */
	    biodone(ubio);				    /* top level buffer completed */
	    freerq(rq);					    /* return the request storage */
	}
    }
    rel_mplock();
}
/* Start the second phase of a RAID-4 or RAID-5 group write operation. */
void
complete_raid5_write(struct rqelement *rqe)
{
    int *sdata;						    /* source */
    int *pdata;						    /* and parity block data */
    int length;						    /* and count involved */
    int count;						    /* loop counter */
    int rqno;						    /* request index */
    int rqoffset;					    /* offset of request data from parity data */
    struct bio *ubio;					    /* user buffer header */
    struct request *rq;					    /* pointer to our request */
    struct rqgroup *rqg;				    /* and to the request group */
    struct rqelement *prqe;				    /* point to the parity block */
    struct drive *drive;				    /* drive to access */
    rqg = rqe->rqg;					    /* and to our request group */
    rq = rqg->rq;					    /* point to our request */
    ubio = rq->bio;					    /* user's buffer header */
    prqe = &rqg->rqe[0];				    /* point to the parity block */

    /*
     * If we get to this function, we have normal or
     * degraded writes, or a combination of both.  We do
     * the same thing in each case: we perform an
     * exclusive or to the parity block.  The only
     * difference is the origin of the data and the
     * address range.
     */
    if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* do the degraded write stuff */
	pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
	bzero(pdata, prqe->grouplen << DEV_BSHIFT);	    /* start with nothing in the parity block */

	/* Now get what data we need from each block */
	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
	    rqe = &rqg->rqe[rqno];			    /* this request */
	    sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
	    length = rqe->grouplen << (DEV_BSHIFT - 2);	    /* and count involved */

	    /*
	     * Add the data block to the parity block.  Before
	     * we started the request, we zeroed the parity
	     * block, so the result of adding all the other
	     * blocks and the block we want to write will be
	     * the correct parity block.
	     */
	    for (count = 0; count < length; count++)
		pdata[count] ^= sdata[count];
	    if ((rqe->flags & XFR_MALLOCED)		    /* the buffer was malloced, */
	    &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {	    /* and we have no normal write, */
		Free(rqe->b.b_data);			    /* free it now */
		rqe->flags &= ~XFR_MALLOCED;
	    }
	}
    }
    if (rqg->flags & XFR_NORMAL_WRITE) {		    /* do normal write stuff */
	/* Get what data we need from each block */
	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
	    rqe = &rqg->rqe[rqno];			    /* this request */
	    if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
		== (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
		sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
		rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
		pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
		length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

		/*
		 * "remove" the old data block
		 * from the parity block
		 */
		if ((pdata < ((int *) prqe->b.b_data))
		    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
		    || (sdata < ((int *) rqe->b.b_data))
		    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
		    panic("complete_raid5_write: bounds overflow");
		for (count = 0; count < length; count++)
		    pdata[count] ^= sdata[count];

		/* "add" the new data block */
		sdata = (int *) (&ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
		if ((sdata < ((int *) ubio->bio_buf->b_data))
		    || (&sdata[length] > ((int *) (ubio->bio_buf->b_data + ubio->bio_buf->b_bcount))))
		    panic("complete_raid5_write: bounds overflow");
		for (count = 0; count < length; count++)
		    pdata[count] ^= sdata[count];

		/* Free the malloced buffer */
		if (rqe->flags & XFR_MALLOCED) {	    /* the buffer was malloced, */
		    Free(rqe->b.b_data);		    /* free it */
		    rqe->flags &= ~XFR_MALLOCED;
		} else
		    panic("complete_raid5_write: malloc conflict");

		if ((rqe->b.b_cmd == BUF_CMD_READ)	    /* this was a read */
		&&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
		    rqe->b.b_cmd = BUF_CMD_WRITE;   /* we're writing now */
		    rqe->b.b_bio1.bio_done = complete_rqe;	    /* by calling us here */
		    rqe->flags &= ~XFR_PARITYOP;	    /* reset flags that brought us here */
		    rqe->b.b_data = &ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
		    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
		    rqe->b.b_resid = rqe->b.b_bcount;	    /* nothing transferred */
		    rqe->b.b_bio1.bio_offset += (off_t)rqe->dataoffset << DEV_BSHIFT;	    /* point to the correct block */
		    drive = &DRIVE[rqe->driveno];	    /* drive to access */
		    rqe->b.b_bio1.bio_driver_info = drive->dev;
		    rqg->active++;			    /* another active request */

							    /* We can't sleep here, so we just increment the counters. */
		    drive->active++;
		    if (drive->active >= drive->maxactive)
			drive->maxactive = drive->active;
		    vinum_conf.active++;
		    if (vinum_conf.active >= vinum_conf.maxactive)
			vinum_conf.maxactive = vinum_conf.active;
#if VINUMDEBUG
		    if (debug & DEBUG_ADDRESSES)
			log(LOG_DEBUG,
			    "  %s dev %s, sd %d, offset 0x%jx, devoffset 0x%jx, length %d\n",
			    (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
			    drive->devicename,
			    rqe->sdno,
			    (uintmax_t)(rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT)),
			    (uintmax_t)rqe->b.b_bio1.bio_offset,
			    rqe->b.b_bcount);
		    if (debug & DEBUG_LASTREQS)
			logrq(loginfo_raid5_data, (union rqinfou) rqe, ubio);
#endif
		    vn_strategy(drive->vp, &rqe->b.b_bio1);
		}
	    }
	}
    }
    /* Finally, write the parity block */
    rqe = &rqg->rqe[0];
    rqe->b.b_cmd = BUF_CMD_WRITE;		    /* we're writing now */
    rqe->b.b_bio1.bio_done = complete_rqe;			    /* by calling us here */
    rqg->flags &= ~XFR_PARITYOP;			    /* reset flags that brought us here */
    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;	    /* length to write */
    rqe->b.b_resid = rqe->b.b_bcount;			    /* nothing transferred */
    drive = &DRIVE[rqe->driveno];			    /* drive to access */
    rqe->b.b_bio1.bio_driver_info = drive->dev;
    rqg->active++;					    /* another active request */

    /* We can't sleep here, so we just increment the counters. */
    drive->active++;
    if (drive->active >= drive->maxactive)
	drive->maxactive = drive->active;
    vinum_conf.active++;
    if (vinum_conf.active >= vinum_conf.maxactive)
	vinum_conf.maxactive = vinum_conf.active;

#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "  %s dev %s, sd %d, offset 0x%jx, devoffset 0x%jx, length %d\n",
	    (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
	    drive->devicename,
	    rqe->sdno,
	    (uintmax_t)(rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT)),
	    (uintmax_t)rqe->b.b_bio1.bio_offset,
	    rqe->b.b_bcount);
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubio);
#endif
    vn_strategy(drive->vp, &rqe->b.b_bio1);
}
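/*
 * A small self-contained demonstration (not vinum code) of the XOR identity
 * that complete_raid5_write relies on: for parity P = D0 ^ D1 ^ ... ^ Dn,
 * replacing one data block Dk with Dk' can be done incrementally as
 * P' = P ^ Dk ^ Dk', which equals the parity recomputed from scratch.
 * Block count, block size and data patterns below are arbitrary.
 */
#include <assert.h>
#include <stdio.h>

#define NBLOCKS 4
#define NWORDS  8

int
main(void)
{
    unsigned data[NBLOCKS][NWORDS];
    unsigned parity[NWORDS];
    unsigned newblock[NWORDS];
    int b, w;

    /* Fill the data blocks with arbitrary patterns and compute full parity. */
    for (b = 0; b < NBLOCKS; b++)
	for (w = 0; w < NWORDS; w++)
	    data[b][w] = (b + 1) * 0x01010101u + w;
    for (w = 0; w < NWORDS; w++) {
	parity[w] = 0;
	for (b = 0; b < NBLOCKS; b++)
	    parity[w] ^= data[b][w];
    }

    /* New contents for block 2. */
    for (w = 0; w < NWORDS; w++)
	newblock[w] = 0xdeadbeefu + w;

    /* Incremental update: "remove" the old block, then "add" the new one. */
    for (w = 0; w < NWORDS; w++)
	parity[w] ^= data[2][w] ^ newblock[w];

    /* Cross-check against a full recomputation over the updated blocks. */
    for (w = 0; w < NWORDS; w++)
	data[2][w] = newblock[w];
    for (w = 0; w < NWORDS; w++) {
	unsigned full = 0;
	for (b = 0; b < NBLOCKS; b++)
	    full ^= data[b][w];
	assert(full == parity[w]);
    }
    printf("incremental parity update matches full recomputation\n");
    return 0;
}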
Example #6
/* Perform I/O on a subdisk */
void
sdio(struct buf *bp)
{
    int s;						    /* spl */
    struct sd *sd;
    struct sdbuf *sbp;
    daddr_t endoffset;
    struct drive *drive;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
    sd = &SD[Sdno(bp->b_dev)];				    /* point to the subdisk */
    drive = &DRIVE[sd->driveno];

    if (drive->state != drive_up) {
	if (sd->state >= sd_crashed) {
	    if (bp->b_flags & B_READ)			    /* reading, */
		set_sd_state(sd->sdno, sd_crashed, setstate_force);
	    else
		set_sd_state(sd->sdno, sd_stale, setstate_force);
	}
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /*
     * We allow access to any kind of subdisk as long as we can expect
     * to get the I/O performed.
     */
    if (sd->state < sd_empty) {				    /* nothing to talk to, */
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /* Get a buffer */
    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
    if (sbp == NULL) {
	bp->b_flags |= B_ERROR;
	bp->b_error = ENOMEM;
	biodone(bp);
	return;
    }
    bzero(sbp, sizeof(struct sdbuf));			    /* start with nothing */
    sbp->b.b_flags = bp->b_flags | B_CALL;		    /* inform us when it's done */
    sbp->b.b_bufsize = bp->b_bufsize;			    /* buffer size */
    sbp->b.b_bcount = bp->b_bcount;			    /* number of bytes to transfer */
    sbp->b.b_resid = bp->b_resid;			    /* and amount waiting */
    sbp->b.b_dev = DRIVE[sd->driveno].dev;		    /* device */
    sbp->b.b_data = bp->b_data;				    /* data buffer */
    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
    sbp->b.b_iodone = sdio_done;			    /* come here on completion */
    BUF_LOCKINIT(&sbp->b);				    /* get a lock for the buffer */
    BUF_LOCK(&sbp->b, LK_EXCLUSIVE);			    /* and lock it */
    sbp->bp = bp;					    /* note the address of the original header */
    sbp->sdno = sd->sdno;				    /* note for statistics */
    sbp->driveno = sd->driveno;
    endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;  /* final sector offset */
    if (endoffset > sd->sectors) {			    /* beyond the end */
	sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
	if (sbp->b.b_bcount <= 0) {			    /* nothing to transfer */
	    bp->b_resid = bp->b_bcount;			    /* nothing transferred */
	    biodone(bp);
	    Free(sbp);
	    return;
	}
    }
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
	    sbp->b.b_flags & B_READ ? "Read" : "Write",
	    major(sbp->b.b_dev),
	    minor(sbp->b.b_dev),
	    sbp->sdno,
	    (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
	    (int) sbp->b.b_blkno,
	    sbp->b.b_bcount);
#endif
    s = splbio();
#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
    BUF_STRATEGY(&sbp->b, 0);
    splx(s);
}
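/*
 * A standalone sketch (not vinum code) of the end-of-subdisk trim performed
 * near the end of sdio() above: if the transfer would run past the last
 * sector of the subdisk, the byte count is cut back so it ends exactly at
 * the boundary, and a result of zero or less means there is nothing to do.
 * DEV_BSIZE is assumed to be 512, as in the BSD headers.
 */
#include <stdio.h>

#define DEV_BSIZE 512

/* Trimmed byte count for a transfer of bcount bytes starting at sector blkno
 * on a subdisk that is 'sectors' sectors long. */
static long
trim_bcount(long blkno, long bcount, long sectors)
{
    long endoffset = blkno + bcount / DEV_BSIZE;	    /* final sector offset */

    if (endoffset > sectors)				    /* beyond the end? */
	bcount -= (endoffset - sectors) * DEV_BSIZE;	    /* trim the excess */
    return bcount;
}

int
main(void)
{
    /* 16-sector transfer starting 8 sectors before the end: half survives. */
    printf("%ld\n", trim_bcount(92, 16 * DEV_BSIZE, 100));  /* 4096 */
    /* Transfer starting past the end: nothing left to transfer. */
    printf("%ld\n", trim_bcount(104, 16 * DEV_BSIZE, 100)); /* negative */
    return 0;
}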
Example #7
/*
 * Call the low-level strategy routines to
 * perform the requests in a struct request
 */
int
launch_requests(struct request *rq, int reviveok)
{
    int s;
    struct rqgroup *rqg;
    int rqno;						    /* loop index */
    struct rqelement *rqe;				    /* current element */
    struct drive *drive;
    int rcount;						    /* request count */

    /*
     * First find out whether we're reviving, and the
     * request contains a conflict.  If so, we hang
     * the request off plex->waitlist of the first
     * plex we find which is reviving
     */
    if ((rq->flags & XFR_REVIVECONFLICT)		    /* possible revive conflict */
    &&(!reviveok)) {					    /* and we don't want to do it now, */
	struct sd *sd;
	struct request *waitlist;			    /* point to the waitlist */

	sd = &SD[rq->sdno];
	if (sd->waitlist != NULL) {			    /* something there already, */
	    waitlist = sd->waitlist;
	    while (waitlist->next != NULL)		    /* find the end */
		waitlist = waitlist->next;
	    waitlist->next = rq;			    /* hook our request there */
	} else
	    sd->waitlist = rq;				    /* hook our request at the front */

#if VINUMDEBUG
	if (debug & DEBUG_REVIVECONFLICT)
	    log(LOG_DEBUG,
		"Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
		rq->sdno,
		rq,
		rq->bp->b_flags & B_READ ? "Read" : "Write",
		major(rq->bp->b_dev),
		minor(rq->bp->b_dev),
		rq->bp->b_blkno,
		rq->bp->b_bcount);
#endif
	return 0;					    /* and get out of here */
    }
    rq->active = 0;					    /* nothing yet */
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "Request: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
	    rq,
	    rq->bp->b_flags & B_READ ? "Read" : "Write",
	    major(rq->bp->b_dev),
	    minor(rq->bp->b_dev),
	    rq->bp->b_blkno,
	    rq->bp->b_bcount);
    vinum_conf.lastrq = rq;
    vinum_conf.lastbuf = rq->bp;
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
#endif

    /*
     * With the division of labour below (first count the requests, then
     * issue them), it's possible that we don't need this splbio()
     * protection.  But I'll try that some other time.
     */
    s = splbio();
    for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) {	    /* through the whole request chain */
	rqg->active = rqg->count;			    /* they're all active */
	for (rqno = 0; rqno < rqg->count; rqno++) {
	    rqe = &rqg->rqe[rqno];
	    if (rqe->flags & XFR_BAD_SUBDISK)		    /* this subdisk is bad, */
		rqg->active--;				    /* one less active request */
	}
	if (rqg->active)				    /* we have at least one active request, */
	    rq->active++;				    /* one more active request group */
    }

    /* Now fire off the requests */
    for (rqg = rq->rqg; rqg != NULL;) {			    /* through the whole request chain */
	if (rqg->lockbase >= 0)				    /* this rqg needs a lock first */
	    rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]);
	rcount = rqg->count;
	for (rqno = 0; rqno < rcount;) {
	    rqe = &rqg->rqe[rqno];

	    /*
	     * Point to next rqg before the bottom end
	     * changes the structures.
	     */
	    if (++rqno >= rcount)
		rqg = rqg->next;
	    if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {	    /* this subdisk is good, */
		drive = &DRIVE[rqe->driveno];		    /* look at drive */
		drive->active++;
		if (drive->active >= drive->maxactive)
		    drive->maxactive = drive->active;
		vinum_conf.active++;
		if (vinum_conf.active >= vinum_conf.maxactive)
		    vinum_conf.maxactive = vinum_conf.active;

#if VINUMDEBUG
		if (debug & DEBUG_ADDRESSES)
		    log(LOG_DEBUG,
			"  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
			rqe->b.b_flags & B_READ ? "Read" : "Write",
			major(rqe->b.b_dev),
			minor(rqe->b.b_dev),
			rqe->sdno,
			(u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
			rqe->b.b_blkno,
			rqe->b.b_bcount);
		if (debug & DEBUG_LASTREQS)
		    logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
#endif


		/* fire off the request */
		BUF_STRATEGY(&rqe->b, 0);
	    }
	}
    }
    splx(s);
    return 0;
}
Example #8
/*
 * Start a transfer.  Return -1 on error,
 * 0 if OK, 1 if we need to retry.
 * Parameter reviveok is set when doing
 * transfers for revives: it allows transfers to
 * be started immediately when a revive is in
 * progress.  During revive, normal transfers
 * are queued if they share address space with
 * a currently active revive operation.
 */
int
vinumstart(struct buf *bp, int reviveok)
{
    int plexno;
    int maxplex;					    /* maximum number of plexes to handle */
    struct volume *vol;
    struct request *rq;					    /* build up our request here */
    enum requeststatus status;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_user_bp, (union rqinfou) bp, bp);
#endif

    if ((bp->b_bcount % DEV_BSIZE) != 0) {		    /* bad length */
	bp->b_error = EINVAL;				    /* invalid size */
	bp->b_flags |= B_ERROR;
	biodone(bp);
	return -1;
    }
    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
    if (rq == NULL) {					    /* can't do it */
	bp->b_error = ENOMEM;				    /* can't get memory */
	bp->b_flags |= B_ERROR;
	biodone(bp);
	return -1;
    }
    bzero(rq, sizeof(struct request));

    /*
     * Note the volume ID.  This can be NULL, which
     * the request building functions use as an
     * indication for single plex I/O
     */
    rq->bp = bp;					    /* and the user buffer struct */

    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {	    /* it's a volume, */
	rq->volplex.volno = Volno(bp->b_dev);		    /* get the volume number */
	vol = &VOL[rq->volplex.volno];			    /* and point to it */
	vol->active++;					    /* one more active request */
	maxplex = vol->plexes;				    /* consider all its plexes */
    } else {
	vol = NULL;					    /* no volume */
	rq->volplex.plexno = Plexno(bp->b_dev);		    /* point to the plex */
	rq->isplex = 1;					    /* note that it's a plex */
	maxplex = 1;					    /* just the one plex */
    }

    if (bp->b_flags & B_READ) {
	/*
	 * This is a read request.  Decide
	 * which plex to read from.
	 *
	 * There's a potential race condition here,
	 * since we're not locked, and we could end
	 * up multiply incrementing the round-robin
	 * counter.  This doesn't have any serious
	 * effects, however.
	 */
	if (vol != NULL) {
	    vol->reads++;
	    plexno = vol->preferred_plex;		    /* get the plex to use */
	    if (plexno < 0) {				    /* round robin */
		plexno = vol->last_plex_read;
		vol->last_plex_read++;
		if (vol->last_plex_read >= vol->plexes)	    /* got to the end? */
		    vol->last_plex_read = 0;		    /* wrap around */
	    }
	    status = build_read_request(rq, plexno);	    /* build a request */
	} else {
	    daddr_t diskaddr = bp->b_blkno;		    /* start offset of transfer */
	    status = bre(rq,				    /* build a request list */
		rq->volplex.plexno,
		&diskaddr,
		diskaddr + (bp->b_bcount / DEV_BSIZE));
	}

	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_flags |= B_ERROR;
	    }
	    biodone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    } else
	/*
	 * This is a write operation.  We write to all plexes.  If this is
	 * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
	 */
    {
	if (vol != NULL) {
	    vol->writes++;
	    status = build_write_request(rq);		    /* Not all the subdisks are up */
	} else {					    /* plex I/O */
	    daddr_t diskstart;

	    diskstart = bp->b_blkno;			    /* start offset of transfer */
	    status = bre(rq,
		Plexno(bp->b_dev),
		&diskstart,
		bp->b_blkno + (bp->b_bcount / DEV_BSIZE));  /* build requests for the plex */
	}
	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_flags |= B_ERROR;
	    }
	    if ((bp->b_flags & B_DONE) == 0)
		biodone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    }
}
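/*
 * A minimal sketch (not vinum code) of the round-robin read policy described
 * in vinumstart() above: when the volume has no preferred plex (preferred_plex
 * is negative), successive reads cycle through the plexes via a wrapping
 * counter.  The struct and field names are simplified stand-ins.
 */
#include <stdio.h>

struct vol {
    int plexes;					    /* number of plexes in the volume */
    int preferred_plex;				    /* fixed choice, or -1 for round robin */
    int last_plex_read;				    /* round-robin position */
};

static int
pick_read_plex(struct vol *vol)
{
    int plexno = vol->preferred_plex;

    if (plexno < 0) {				    /* round robin */
	plexno = vol->last_plex_read;
	vol->last_plex_read++;
	if (vol->last_plex_read >= vol->plexes)	    /* got to the end? */
	    vol->last_plex_read = 0;		    /* wrap around */
    }
    return plexno;
}

int
main(void)
{
    struct vol v = { 3, -1, 0 };
    int i;

    for (i = 0; i < 5; i++)			    /* prints 0 1 2 0 1 */
	printf("%d ", pick_read_plex(&v));
    printf("\n");
    return 0;
}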
Example #9
/* Lock a stripe of a plex, wait if it's in use */
struct rangelock *
lockrange(daddr_t stripe, struct buf *bp, struct plex *plex)
{
    int s;
    struct rangelock *lock;
    struct rangelock *pos;				    /* position of first free lock */
    int foundlocks;					    /* number of locks found */
    int newlock;

    /*
     * We could get by without counting the number
     * of locks we find, but we have a linear search
     * through a table which in most cases will be
     * empty.  It's faster to stop when we've found
     * all the locks that are there.  This is also
     * the reason why we put pos at the beginning
     * instead of the end, though it requires an
     * extra test.
     */
    pos = NULL;
    foundlocks = 0;

    /*
     * we can't use 0 as a valid address, so
     * increment all addresses by 1.
     */
    stripe++;
    /*
     * We give the locks back from an interrupt
     * context, so we need to raise the spl here.
     */
    s = splbio();

    /* Search the lock table for our stripe */
    for (lock = plex->lock;
	lock < &plex->lock[plex->alloclocks]
	&& foundlocks < plex->usedlocks;
	lock++) {
	if (lock->stripe) {				    /* in use */
	    foundlocks++;				    /* found another one in use */
	    if ((lock->stripe == stripe)		    /* it's our stripe */
&&(lock->plexno == plex->plexno)			    /* and our plex */
	    &&(lock->bp != bp)) {			    /* but not our request */
		/*
		 * It would be nice to sleep on the lock
		 * itself, but it could get moved if the
		 * table expands during the wait.  Wait on
		 * the lock address + 1 (since waiting on
		 * 0 isn't allowed) instead.  It isn't
		 * exactly unique, but we won't have many
		 * conflicts.  The worst effect of a
		 * conflict would be an additional
		 * schedule and time through this loop.
		 */
#ifdef VINUMDEBUG
		if (debug & DEBUG_LASTREQS) {
		    struct rangelock info;

		    info.stripe = stripe;
		    info.bp = bp;
		    info.plexno = plex->plexno;
		    logrq(loginfo_lockwait, (union rqinfou) &info, bp);
		}
#endif
		plex->lockwaits++;			    /* waited one more time */
		while (lock->stripe)			    /* wait for it to become free */
		    tsleep((void *) lock->stripe, PRIBIO, "vrlock", 2 * hz);
		break;					    /* out of the inner level loop */
	    }
	} else {
	    if (pos == NULL)				    /* still looking for somewhere? */
		pos = lock;				    /* a place to put this one */
	}
    }

    /*
     * The address range is free.  Add our lock
     * entry.
     */
    if (pos == NULL) {					    /* Didn't find an entry */
	if (foundlocks >= plex->alloclocks) {		    /* searched the lot, */
	    newlock = plex->alloclocks;
	    EXPAND(plex->lock, struct rangelock, plex->alloclocks, INITIAL_LOCKS);
	    pos = &plex->lock[newlock];
	    while (newlock < plex->alloclocks)
		plex->lock[newlock++].stripe = 0;
	} else