Example #1
/*
 * Abort a request: free resources and complete the
 * user request with the specified error
 */
int
abortrequest(struct request *rq, int error)
{
    struct buf *bp = rq->bp;				    /* user buffer */

    bp->b_flags |= B_ERROR;
    bp->b_error = error;
    freerq(rq);						    /* free everything we're doing */
    biodone(bp);
    return error;					    /* and give up */
}
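
Because abortrequest() returns the error code it is handed, a request-building routine can fail a request and bail out in a single statement. The fragment below is a hypothetical usage sketch, not part of the original listing; the function name and the failure condition are invented for illustration.

/*
 * Hypothetical sketch: bail out of request building through
 * abortrequest(), which completes the user buffer in error,
 * frees the request and hands back the error code.
 */
static int
build_example(struct request *rq, struct rqgroup *rqg)
{
    if (rqg == NULL)					    /* couldn't allocate the group? */
	return abortrequest(rq, ENOMEM);		    /* fail the user request and give up */
    /* ... otherwise carry on building the request ... */
    return 0;
}
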
/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
void
complete_rqe(struct bio *bio)
{
    union daemoninfo di;
    struct buf *bp = bio->bio_buf;
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct bio *ubio;					    /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;					    /* for error messages */

    get_mplock();					    /* take the MP lock while we touch shared vinum state */

    rqe = (struct rqelement *) bp;			    /* point to the element that completed */
    rqg = rqe->rqg;					    /* and the request group */
    rq = rqg->rq;					    /* and the complete request */
    ubio = rq->bio;					    /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;					    /* one less outstanding I/O on this drive */
    vinum_conf.active--;				    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))		    /* or the global limit */
	wakeup(&launch_requests);			    /* let another one at it */
    if ((bp->b_flags & B_ERROR) != 0) {			    /* transfer in error */
	gravity = "";
	sd = &SD[rqe->sdno];

	if (bp->b_error != 0)				    /* did it return a number? */
	    rq->error = bp->b_error;			    /* yes, put it in. */
	else if (rq->error == 0)			    /* no: do we have one already? */
	    rq->error = EIO;				    /* no: catchall "I/O error" */
	sd->lasterror = rq->error;
	if (bp->b_cmd == BUF_CMD_READ) {
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = " fatal";
		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
	    }
	    log(LOG_ERR,
		"%s:%s read error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	} else {					    /* write operation */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = "fatal ";
		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
	    }
	    log(LOG_ERR,
		"%s:%s write error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	}
	log(LOG_ERR,
	    "%s: user buffer offset %lld for %d bytes\n",
	    sd->name,
	    (long long)ubio->bio_offset,
	    ubio->bio_buf->b_bcount);
	if (rq->error == ENXIO) {			    /* the drive's down too */
	    log(LOG_ERR,
		"%s: fatal drive I/O error, offset %lld for %d bytes\n",
		DRIVE[rqe->driveno].label.name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	    DRIVE[rqe->driveno].lasterror = rq->error;
	    set_drive_state(rqe->driveno,		    /* take the drive down */
		drive_down,
		setstate_force);
	}
    }
    /* Now update the statistics */
    if (bp->b_cmd == BUF_CMD_READ) { 				/* read operation */
	DRIVE[rqe->driveno].reads++;
	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
	SD[rqe->sdno].reads++;
	SD[rqe->sdno].bytes_read += bp->b_bcount;
	PLEX[rqe->rqg->plexno].reads++;
	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
	}
    } else {						    /* write operation */
	DRIVE[rqe->driveno].writes++;
	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
	SD[rqe->sdno].writes++;
	SD[rqe->sdno].bytes_written += bp->b_bcount;
	PLEX[rqe->rqg->plexno].writes++;
	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
	}
    }
    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
	int *sdata;					    /* source */
	int *data;					    /* and group data */
	int length;					    /* and count involved */
	int count;					    /* loop counter */
	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

	/* XOR destination is the user data */
	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT];	/* old data contents */
	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

	for (count = 0; count < length; count++)
	    data[count] ^= sdata[count];

	/*
	 * In a normal read we read directly into the user
	 * buffer.  That doesn't work when we're also doing
	 * a recovery, so we have to copy the data across here.
	 */
	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
	    char *dst;

	    dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
	    bcopy(src, dst, length);			    /* move it */
	}
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
    &&(rqg->active == 1))				    /* and this is the last active request */
	complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;					    /* this request now finished */
    if (rqg->active == 0) {				    /* request group finished, */
	rq->active--;					    /* one less */
	if (rqg->lock) {				    /* got a lock? */
	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
	    rqg->lock = 0;
	}
    }
    if (rq->active == 0) {				    /* request finished, */
#ifdef VINUMDEBUG
	if (debug & DEBUG_RESID) {
	    if (ubio->bio_buf->b_resid != 0)			    /* still something to transfer? */
		Debugger("resid");
	}
#endif

	if (rq->error) {				    /* did we have an error? */
	    if (rq->isplex) {				    /* plex operation, */
		ubio->bio_buf->b_flags |= B_ERROR;	    /* yes, propagate to user */
		ubio->bio_buf->b_error = rq->error;
	    } else {					    /* try to recover */
		di.rq = rq;
		queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
	    }
	} else {
	    ubio->bio_buf->b_resid = 0;			    /* completed our transfer */
	    if (rq->isplex == 0)			    /* volume request, */
		VOL[rq->volplex.volno].active--;	    /* another request finished */
	    biodone(ubio);				    /* top level buffer completed */
	    freerq(rq);					    /* return the request storage */
	}
    }
    rel_mplock();					    /* and drop the MP lock again */
}
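
In the recovery-read branch above, the data of the failed subdisk is reconstructed by XORing the blocks read from the surviving subdisks (and the parity) into the buffer that stands in for the bad subdisk, one int at a time. The helper below is a minimal, stand-alone sketch of that reconstruction step, detached from the vinum structures; the names and buffer layout are assumptions made purely for illustration.

#include <stddef.h>

/*
 * Minimal sketch of the XOR recovery performed in complete_rqe():
 * XOR one surviving block into the buffer being rebuilt.  Calling
 * this once for every surviving data block and once for the parity
 * block leaves the missing block's contents in 'rebuilt'.
 */
static void
xor_into(int *rebuilt, const int *surviving, size_t nints)
{
    size_t i;

    for (i = 0; i < nints; i++)
	rebuilt[i] ^= surviving[i];			    /* accumulate the XOR */
}
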
Example #3
/*
 * Start a transfer.  Return -1 on error,
 * 0 if OK, 1 if we need to retry.
 * Parameter reviveok is set when doing
 * transfers for revives: it allows transfers to
 * be started immediately when a revive is in
 * progress.  During revive, normal transfers
 * are queued if they share address space with
 * a currently active revive operation.
 */
int
vinumstart(struct buf *bp, int reviveok)
{
    int plexno;
    int maxplex;					    /* maximum number of plexes to handle */
    struct volume *vol;
    struct request *rq;					    /* build up our request here */
    enum requeststatus status;

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_user_bp, (union rqinfou) bp, bp);
#endif

    if ((bp->b_bcount % DEV_BSIZE) != 0) {		    /* bad length */
	bp->b_error = EINVAL;				    /* invalid size */
	bp->b_flags |= B_ERROR;
	biodone(bp);
	return -1;
    }
    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
    if (rq == NULL) {					    /* can't do it */
	bp->b_error = ENOMEM;				    /* can't get memory */
	bp->b_flags |= B_ERROR;
	biodone(bp);
	return -1;
    }
    bzero(rq, sizeof(struct request));

    /*
     * Note the volume ID.  This can be NULL, which
     * the request building functions use as an
     * indication for single plex I/O
     */
    rq->bp = bp;					    /* and the user buffer struct */

    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {	    /* it's a volume, */
	rq->volplex.volno = Volno(bp->b_dev);		    /* get the volume number */
	vol = &VOL[rq->volplex.volno];			    /* and point to it */
	vol->active++;					    /* one more active request */
	maxplex = vol->plexes;				    /* consider all its plexes */
    } else {
	vol = NULL;					    /* no volume */
	rq->volplex.plexno = Plexno(bp->b_dev);		    /* point to the plex */
	rq->isplex = 1;					    /* note that it's a plex */
	maxplex = 1;					    /* just the one plex */
    }

    if (bp->b_flags & B_READ) {
	/*
	 * This is a read request.  Decide
	 * which plex to read from.
	 *
	 * There's a potential race condition here,
	 * since we're not locked, and we could end
	 * up multiply incrementing the round-robin
	 * counter.  This doesn't have any serious
	 * effects, however.
	 */
	if (vol != NULL) {
	    vol->reads++;
	    plexno = vol->preferred_plex;		    /* get the plex to use */
	    if (plexno < 0) {				    /* round robin */
		plexno = vol->last_plex_read;
		vol->last_plex_read++;
		if (vol->last_plex_read >= vol->plexes)	    /* reached the end? */
		    vol->last_plex_read = 0;		    /* wrap around */
	    }
	    status = build_read_request(rq, plexno);	    /* build a request */
	} else {
	    daddr_t diskaddr = bp->b_blkno;		    /* start offset of transfer */
	    status = bre(rq,				    /* build a request list */
		rq->volplex.plexno,
		&diskaddr,
		diskaddr + (bp->b_bcount / DEV_BSIZE));
	}

	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_flags |= B_ERROR;
	    }
	    biodone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    } else
	/*
	 * This is a write operation.  We write to all plexes.  If this is
	 * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
	 */
    {
	if (vol != NULL) {
	    vol->writes++;
	    status = build_write_request(rq);		    /* build write requests for all plexes */
	} else {					    /* plex I/O */
	    daddr_t diskstart;

	    diskstart = bp->b_blkno;			    /* start offset of transfer */
	    status = bre(rq,
		Plexno(bp->b_dev),
		&diskstart,
		bp->b_blkno + (bp->b_bcount / DEV_BSIZE));  /* build requests for the plex */
	}
	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_flags |= B_ERROR;
	    }
	    if ((bp->b_flags & B_DONE) == 0)
		biodone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    }
}
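
The header comment above fixes the return convention of vinumstart(): -1 when the request failed (the buffer has already been completed), 0 when the sub-requests were launched, and 1 when the transfer should be retried, typically because it collides with an active revive. A hypothetical caller honouring that contract might look like the sketch below; requeue_later() is an invented placeholder, not a vinum routine.

/*
 * Hypothetical caller sketch: hand a buffer to vinumstart() and
 * requeue it when vinumstart() asks for a retry.
 */
static void
example_strategy(struct buf *bp)
{
    switch (vinumstart(bp, 0)) {			    /* 0: this is not a revive transfer */
    case -1:						    /* failed: bp was completed via biodone() */
	break;
    case 0:						    /* sub-requests launched */
	break;
    case 1:						    /* collides with a revive: try again later */
	requeue_later(bp);				    /* invented placeholder */
	break;
    }
}
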