/* Unlock a volume and let the next one at it */
void
unlockrange(int plexno, struct rangelock *lock)
{
    struct plex *plex;

    plex = &PLEX[plexno];
#ifdef DIAGNOSTIC
    if (lock < &plex->lock[0] || lock >= &plex->lock[PLEX_LOCKS])
	panic("vinum: rangelock %p on plex %d invalid, not between %p and %p",
	    lock,
	    plexno,
	    &plex->lock[0],
	    &plex->lock[PLEX_LOCKS]);
#endif
#ifdef VINUMDEBUG
    if (debug & DEBUG_LOCKREQS) {
	struct rangelockinfo lockinfo;

	lockinfo.stripe = lock->stripe;
	lockinfo.bp = lock->bp;
	lockinfo.plexno = plex->plexno;
	logrq(loginfo_unlock, (union rqinfou) &lockinfo, lock->bp);
    }
#endif
    lock->stripe = 0;					    /* no longer used */
    plex->usedlocks--;					    /* one less lock */
    if (plex->usedlocks == PLEX_LOCKS - 1)		    /* we were full, */
	wakeup(&plex->usedlocks);			    /* get a waiter if one's there */
    wakeup((void *) lock);
}
/* I/O on subdisk completed */
void
sdio_done(struct bio *bio)
{
    struct sdbuf *sbp;

    get_mplock();

    sbp = (struct sdbuf *) bio->bio_buf;
    if (sbp->b.b_flags & B_ERROR) {			    /* had an error */
	sbp->bio->bio_buf->b_flags |= B_ERROR;		    /* propagate upwards */
	sbp->bio->bio_buf->b_error = sbp->b.b_error;
    }
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiodone, (union rqinfou) bio, bio);
#endif
    sbp->bio->bio_buf->b_resid = sbp->b.b_resid;	    /* copy the resid field */
    /* Now update the statistics */
    if (sbp->b.b_cmd == BUF_CMD_READ) {			    /* read operation */
	DRIVE[sbp->driveno].reads++;
	DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
	SD[sbp->sdno].reads++;
	SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
    } else {						    /* write operation */
	DRIVE[sbp->driveno].writes++;
	DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
	SD[sbp->sdno].writes++;
	SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
    }
    biodone_sync(bio);
    biodone(sbp->bio);					    /* complete the caller's I/O */
    BUF_UNLOCK(&sbp->b);
    uninitbufbio(&sbp->b);
    Free(sbp);
    rel_mplock();
}
/* Lock a stripe of a plex, wait if it's in use */
struct rangelock *
lockrange(daddr_t stripe, struct buf *bp, struct plex *plex)
{
    struct rangelock *lock;
    struct rangelock *pos;				    /* position of first free lock */
    int foundlocks;					    /* number of locks found */

    /*
     * We could get by without counting the number
     * of locks we find, but we have a linear search
     * through a table which in most cases will be
     * empty.  It's faster to stop when we've found
     * all the locks that are there.  This is also
     * the reason why we put pos at the beginning
     * instead of the end, though it requires an
     * extra test.
     */
    pos = NULL;
    foundlocks = 0;

    /*
     * we can't use 0 as a valid address, so
     * increment all addresses by 1.
     */
    stripe++;

    mtx_lock(&plex->lockmtx);

    /* Wait here if the table is full */
    while (plex->usedlocks == PLEX_LOCKS)		    /* all in use */
	msleep(&plex->usedlocks, &plex->lockmtx, PRIBIO, "vlock", 0);

#ifdef DIAGNOSTIC
    if (plex->usedlocks >= PLEX_LOCKS)
	panic("lockrange: Too many locks in use");
#endif

    lock = plex->lock;					    /* pointer in lock table */
    if (plex->usedlocks > 0)				    /* something locked, */
	/* Search the lock table for our stripe */
	for (; lock < &plex->lock[PLEX_LOCKS]
	    && foundlocks < plex->usedlocks;
	    lock++) {
	    if (lock->stripe) {				    /* in use */
		foundlocks++;				    /* found another one in use */
		if ((lock->stripe == stripe)		    /* it's our stripe */
		&&(lock->bp != bp)) {			    /* but not our request */
#ifdef VINUMDEBUG
		    if (debug & DEBUG_LOCKREQS) {
			struct rangelockinfo lockinfo;

			lockinfo.stripe = stripe;
			lockinfo.bp = bp;
			lockinfo.plexno = plex->plexno;
			logrq(loginfo_lockwait, (union rqinfou) &lockinfo, bp);
		    }
#endif
		    plex->lockwaits++;			    /* waited one more time */
		    msleep(lock, &plex->lockmtx, PRIBIO, "vrlock", 0);
		    lock = &plex->lock[-1];		    /* start again */
		    foundlocks = 0;
		    pos = NULL;
		}
	    } else if (pos == NULL)			    /* still looking for somewhere? */
		pos = lock;				    /* a place to put this one */
	}

    /*
     * This untidy looking code ensures that we'll
     * always end up pointing to the first free lock
     * entry, thus minimizing the number of
     * iterations necessary.
     */
    if (pos == NULL)					    /* didn't find one on the way, */
	pos = lock;					    /* use the one we're pointing to */

    /*
     * The address range is free, and we're pointing
     * to the first unused entry.  Make it ours.
     */
    pos->stripe = stripe;
    pos->bp = bp;
    plex->usedlocks++;					    /* one more lock */
    mtx_unlock(&plex->lockmtx);
#ifdef VINUMDEBUG
    if (debug & DEBUG_LOCKREQS) {
	struct rangelockinfo lockinfo;

	lockinfo.stripe = stripe;
	lockinfo.bp = bp;
	lockinfo.plexno = plex->plexno;
	logrq(loginfo_lock, (union rqinfou) &lockinfo, bp);
    }
#endif
    return pos;
}
/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
void
complete_rqe(struct bio *bio)
{
    union daemoninfo di;
    struct buf *bp = bio->bio_buf;
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct bio *ubio;					    /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;					    /* for error messages */

    get_mplock();

    rqe = (struct rqelement *) bp;			    /* point to the element that completed */
    rqg = rqe->rqg;					    /* and the request group */
    rq = rqg->rq;					    /* and the complete request */
    ubio = rq->bio;					    /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;					    /* one less outstanding I/O on this drive */
    vinum_conf.active--;				    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))		    /* or the global limit */
	wakeup(&launch_requests);			    /* let another one at it */
    if ((bp->b_flags & B_ERROR) != 0) {			    /* transfer in error */
	gravity = "";
	sd = &SD[rqe->sdno];

	if (bp->b_error != 0)				    /* did it return a number? */
	    rq->error = bp->b_error;			    /* yes, put it in. */
	else if (rq->error == 0)			    /* no: do we have one already? */
	    rq->error = EIO;				    /* no: catchall "I/O error" */
	sd->lasterror = rq->error;
	if (bp->b_cmd == BUF_CMD_READ) {		    /* read operation */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = " fatal";
		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
	    }
	    log(LOG_ERR,
		"%s:%s read error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	} else {					    /* write operation */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = " fatal";
		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
	    }
	    log(LOG_ERR,
		"%s:%s write error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	}
	log(LOG_ERR,
	    "%s: user buffer offset %lld for %d bytes\n",
	    sd->name,
	    (long long)ubio->bio_offset,
	    ubio->bio_buf->b_bcount);
	if (rq->error == ENXIO) {			    /* the drive's down too */
	    log(LOG_ERR,
		"%s: fatal drive I/O error, offset %lld for %d bytes\n",
		DRIVE[rqe->driveno].label.name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	    DRIVE[rqe->driveno].lasterror = rq->error;
	    set_drive_state(rqe->driveno,		    /* take the drive down */
		drive_down,
		setstate_force);
	}
    }
    /* Now update the statistics */
    if (bp->b_cmd == BUF_CMD_READ) {			    /* read operation */
	DRIVE[rqe->driveno].reads++;
	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
	SD[rqe->sdno].reads++;
	SD[rqe->sdno].bytes_read += bp->b_bcount;
	PLEX[rqe->rqg->plexno].reads++;
	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
	}
    } else {						    /* write operation */
	DRIVE[rqe->driveno].writes++;
	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
	SD[rqe->sdno].writes++;
	SD[rqe->sdno].bytes_written += bp->b_bcount;
	PLEX[rqe->rqg->plexno].writes++;
	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
	}
    }
    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
	int *sdata;					    /* source */
	int *data;					    /* and group data */
	int length;					    /* and count involved */
	int count;					    /* loop counter */
	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

	/* XOR destination is the user data */
	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

	for (count = 0; count < length; count++)
	    data[count] ^= sdata[count];

	/*
	 * In a normal read, we will normally read directly
	 * into the user buffer.  This doesn't work if
	 * we're also doing a recovery, so we have to
	 * copy it
	 */
	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
	    char *dst;

	    dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
	    bcopy(src, dst, length);			    /* move it */
	}
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */
	&&(rqg->active == 1))				    /* and this is the last active request */
	complete_raid5_write(rqe);

    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;					    /* this request now finished */
    if (rqg->active == 0) {				    /* request group finished, */
	rq->active--;					    /* one less */
	if (rqg->lock) {				    /* got a lock? */
	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
	    rqg->lock = 0;
	}
    }
    if (rq->active == 0) {				    /* request finished, */
#ifdef VINUMDEBUG
	if (debug & DEBUG_RESID) {
	    if (ubio->bio_buf->b_resid != 0)		    /* still something to transfer? */
		Debugger("resid");
	}
#endif
	if (rq->error) {				    /* did we have an error? */
	    if (rq->isplex) {				    /* plex operation, */
		ubio->bio_buf->b_flags |= B_ERROR;	    /* yes, propagate to user */
		ubio->bio_buf->b_error = rq->error;
	    } else {					    /* try to recover */
		di.rq = rq;
		queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
		rel_mplock();
		return;
	    }
	}
	ubio->bio_buf->b_resid = 0;			    /* completed our transfer */
	if (rq->isplex == 0)				    /* volume request, */
	    VOL[rq->volplex.volno].active--;		    /* another request finished */
	biodone(ubio);					    /* top level buffer completed */
	freerq(rq);					    /* return the request storage */
    }
    rel_mplock();
}
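/*
 * Illustrative sketch only (not called by the driver): the recovery-read
 * path in complete_rqe above rebuilds the missing subdisk's data as the
 * XOR of the corresponding blocks of the surviving group members
 * (remaining data blocks plus parity), accumulating one completing block
 * at a time into the bad subdisk's buffer.  The helper below shows that
 * accumulation step in isolation; the name and signature are assumptions
 * made for illustration.
 */
static __inline void
raid5_xor_accumulate(int *dst, const int *src, int nints)
{
    int i;

    for (i = 0; i < nints; i++)
	dst[i] ^= src[i];				    /* fold this block into the reconstruction */
}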
/* Start the second phase of a RAID-4 or RAID-5 group write operation. */
void
complete_raid5_write(struct rqelement *rqe)
{
    int *sdata;						    /* source */
    int *pdata;						    /* and parity block data */
    int length;						    /* and count involved */
    int count;						    /* loop counter */
    int rqno;						    /* request index */
    int rqoffset;					    /* offset of request data from parity data */
    struct bio *ubio;					    /* user buffer header */
    struct request *rq;					    /* pointer to our request */
    struct rqgroup *rqg;				    /* and to the request group */
    struct rqelement *prqe;				    /* point to the parity block */
    struct drive *drive;				    /* drive to access */

    rqg = rqe->rqg;					    /* and to our request group */
    rq = rqg->rq;					    /* point to our request */
    ubio = rq->bio;					    /* user's buffer header */
    prqe = &rqg->rqe[0];				    /* point to the parity block */

    /*
     * If we get to this function, we have normal or
     * degraded writes, or a combination of both.  We do
     * the same thing in each case: we perform an
     * exclusive or to the parity block.  The only
     * difference is the origin of the data and the
     * address range.
     */
    if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* do the degraded write stuff */
	pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
	bzero(pdata, prqe->grouplen << DEV_BSHIFT);	    /* start with nothing in the parity block */

	/* Now get what data we need from each block */
	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
	    rqe = &rqg->rqe[rqno];			    /* this request */
	    sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
	    length = rqe->grouplen << (DEV_BSHIFT - 2);	    /* and count involved */

	    /*
	     * Add the data block to the parity block.  Before
	     * we started the request, we zeroed the parity
	     * block, so the result of adding all the other
	     * blocks and the block we want to write will be
	     * the correct parity block.
	     */
	    for (count = 0; count < length; count++)
		pdata[count] ^= sdata[count];
	    if ((rqe->flags & XFR_MALLOCED)		    /* the buffer was malloced, */
	    &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {	    /* and we have no normal write, */
		Free(rqe->b.b_data);			    /* free it now */
		rqe->flags &= ~XFR_MALLOCED;
	    }
	}
    }
    if (rqg->flags & XFR_NORMAL_WRITE) {		    /* do normal write stuff */
	/* Get what data we need from each block */
	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
	    rqe = &rqg->rqe[rqno];			    /* this request */
	    if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
		== (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
		sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
		rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
		pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
		length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

		/*
		 * "remove" the old data block
		 * from the parity block
		 */
		if ((pdata < ((int *) prqe->b.b_data))
		    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
		    || (sdata < ((int *) rqe->b.b_data))
		    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
		    panic("complete_raid5_write: bounds overflow");
		for (count = 0; count < length; count++)
		    pdata[count] ^= sdata[count];

		/* "add" the new data block */
		sdata = (int *) (&ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
		if ((sdata < ((int *) ubio->bio_buf->b_data))
		    || (&sdata[length] > ((int *) (ubio->bio_buf->b_data + ubio->bio_buf->b_bcount))))
		    panic("complete_raid5_write: bounds overflow");
		for (count = 0; count < length; count++)
		    pdata[count] ^= sdata[count];

		/* Free the malloced buffer */
		if (rqe->flags & XFR_MALLOCED) {	    /* the buffer was malloced, */
		    Free(rqe->b.b_data);		    /* free it */
		    rqe->flags &= ~XFR_MALLOCED;
		} else
		    panic("complete_raid5_write: malloc conflict");

		if ((rqe->b.b_cmd == BUF_CMD_READ)	    /* this was a read */
		&&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
		    rqe->b.b_cmd = BUF_CMD_WRITE;	    /* we're writing now */
		    rqe->b.b_bio1.bio_done = complete_rqe;  /* by calling us here */
		    rqe->flags &= ~XFR_PARITYOP;	    /* reset flags that brought us here */
		    rqe->b.b_data = &ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
		    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
		    rqe->b.b_resid = rqe->b.b_bcount;	    /* nothing transferred */
		    rqe->b.b_bio1.bio_offset += (off_t)rqe->dataoffset << DEV_BSHIFT; /* point to the correct block */
		    drive = &DRIVE[rqe->driveno];	    /* drive to access */
		    rqe->b.b_bio1.bio_driver_info = drive->dev;
		    rqg->active++;			    /* another active request */

		    /* We can't sleep here, so we just increment the counters. */
		    drive->active++;
		    if (drive->active >= drive->maxactive)
			drive->maxactive = drive->active;
		    vinum_conf.active++;
		    if (vinum_conf.active >= vinum_conf.maxactive)
			vinum_conf.maxactive = vinum_conf.active;

#if VINUMDEBUG
		    if (debug & DEBUG_ADDRESSES)
			log(LOG_DEBUG,
			    "  %s dev %s, sd %d, offset 0x%jx, devoffset 0x%jx, length %d\n",
			    (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
			    drive->devicename,
			    rqe->sdno,
			    (uintmax_t)(rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT)),
			    (uintmax_t)rqe->b.b_bio1.bio_offset,
			    rqe->b.b_bcount);
		    if (debug & DEBUG_LASTREQS)
			logrq(loginfo_raid5_data, (union rqinfou) rqe, ubio);
#endif
		    vn_strategy(drive->vp, &rqe->b.b_bio1);
		}
	    }
	}
    }
    /* Finally, write the parity block */
    rqe = &rqg->rqe[0];
    rqe->b.b_cmd = BUF_CMD_WRITE;			    /* we're writing now */
    rqe->b.b_bio1.bio_done = complete_rqe;		    /* by calling us here */
    rqg->flags &= ~XFR_PARITYOP;			    /* reset flags that brought us here */
    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;	    /* length to write */
    rqe->b.b_resid = rqe->b.b_bcount;			    /* nothing transferred */
    drive = &DRIVE[rqe->driveno];			    /* drive to access */
    rqe->b.b_bio1.bio_driver_info = drive->dev;
    rqg->active++;					    /* another active request */

    /* We can't sleep here, so we just increment the counters. */
    drive->active++;
    if (drive->active >= drive->maxactive)
	drive->maxactive = drive->active;
    vinum_conf.active++;
    if (vinum_conf.active >= vinum_conf.maxactive)
	vinum_conf.maxactive = vinum_conf.active;

#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "  %s dev %s, sd %d, offset 0x%jx, devoffset 0x%jx, length %d\n",
	    (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
	    drive->devicename,
	    rqe->sdno,
	    (uintmax_t)(rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT)),
	    (uintmax_t)rqe->b.b_bio1.bio_offset,
	    rqe->b.b_bcount);
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubio);
#endif
    vn_strategy(drive->vp, &rqe->b.b_bio1);
}
/* Perform I/O on a subdisk */
void
sdio(struct buf *bp)
{
    int s;						    /* spl */
    struct sd *sd;
    struct sdbuf *sbp;
    daddr_t endoffset;
    struct drive *drive;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
    sd = &SD[Sdno(bp->b_dev)];				    /* point to the subdisk */
    drive = &DRIVE[sd->driveno];
    if (drive->state != drive_up) {
	if (sd->state >= sd_crashed) {
	    if (bp->b_flags & B_READ)			    /* reading, */
		set_sd_state(sd->sdno, sd_crashed, setstate_force);
	    else
		set_sd_state(sd->sdno, sd_stale, setstate_force);
	}
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /*
     * We allow access to any kind of subdisk as long as we can expect
     * to get the I/O performed.
     */
    if (sd->state < sd_empty) {				    /* nothing to talk to, */
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /* Get a buffer */
    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
    if (sbp == NULL) {
	bp->b_flags |= B_ERROR;
	bp->b_error = ENOMEM;
	biodone(bp);
	return;
    }
    bzero(sbp, sizeof(struct sdbuf));			    /* start with nothing */
    sbp->b.b_flags = bp->b_flags | B_CALL;		    /* inform us when it's done */
    sbp->b.b_bufsize = bp->b_bufsize;			    /* buffer size */
    sbp->b.b_bcount = bp->b_bcount;			    /* number of bytes to transfer */
    sbp->b.b_resid = bp->b_resid;			    /* and amount waiting */
    sbp->b.b_dev = DRIVE[sd->driveno].dev;		    /* device */
    sbp->b.b_data = bp->b_data;				    /* data buffer */
    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
    sbp->b.b_iodone = sdio_done;			    /* come here on completion */
    BUF_LOCKINIT(&sbp->b);				    /* get a lock for the buffer */
    BUF_LOCK(&sbp->b, LK_EXCLUSIVE);			    /* and lock it */
    sbp->bp = bp;					    /* note the address of the original header */
    sbp->sdno = sd->sdno;				    /* note for statistics */
    sbp->driveno = sd->driveno;
    endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;  /* final sector offset */
    if (endoffset > sd->sectors) {			    /* beyond the end */
	sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
	if (sbp->b.b_bcount <= 0) {			    /* nothing to transfer */
	    bp->b_resid = bp->b_bcount;			    /* nothing transferred */
	    biodone(bp);
	    Free(sbp);
	    return;
	}
    }
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
	    sbp->b.b_flags & B_READ ? "Read" : "Write",
	    major(sbp->b.b_dev),
	    minor(sbp->b.b_dev),
	    sbp->sdno,
	    (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
	    (int) sbp->b.b_blkno,
	    sbp->b.b_bcount);
#endif
    s = splbio();
#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
    BUF_STRATEGY(&sbp->b, 0);
    splx(s);
}
/*
 * Call the low-level strategy routines to
 * perform the requests in a struct request
 */
int
launch_requests(struct request *rq, int reviveok)
{
    int s;
    struct rqgroup *rqg;
    int rqno;						    /* loop index */
    struct rqelement *rqe;				    /* current element */
    struct drive *drive;
    int rcount;						    /* request count */

    /*
     * First find out whether we're reviving, and the
     * request contains a conflict.  If so, we hang
     * the request off plex->waitlist of the first
     * plex we find which is reviving
     */
    if ((rq->flags & XFR_REVIVECONFLICT)		    /* possible revive conflict */
    &&(!reviveok)) {					    /* and we don't want to do it now, */
	struct sd *sd;
	struct request *waitlist;			    /* point to the waitlist */

	sd = &SD[rq->sdno];
	if (sd->waitlist != NULL) {			    /* something there already, */
	    waitlist = sd->waitlist;
	    while (waitlist->next != NULL)		    /* find the end */
		waitlist = waitlist->next;
	    waitlist->next = rq;			    /* hook our request there */
	} else
	    sd->waitlist = rq;				    /* hook our request at the front */

#if VINUMDEBUG
	if (debug & DEBUG_REVIVECONFLICT)
	    log(LOG_DEBUG,
		"Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
		rq->sdno,
		rq,
		rq->bp->b_flags & B_READ ? "Read" : "Write",
		major(rq->bp->b_dev),
		minor(rq->bp->b_dev),
		rq->bp->b_blkno,
		rq->bp->b_bcount);
#endif
	return 0;					    /* and get out of here */
    }
    rq->active = 0;					    /* nothing yet */
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "Request: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
	    rq,
	    rq->bp->b_flags & B_READ ? "Read" : "Write",
	    major(rq->bp->b_dev),
	    minor(rq->bp->b_dev),
	    rq->bp->b_blkno,
	    rq->bp->b_bcount);
    vinum_conf.lastrq = rq;
    vinum_conf.lastbuf = rq->bp;
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
#endif

    /*
     * With the division of labour below (first count the requests, then
     * issue them), it's possible that we don't need this splbio()
     * protection.  But I'll try that some other time.
     */
    s = splbio();
    for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) {	    /* through the whole request chain */
	rqg->active = rqg->count;			    /* they're all active */
	for (rqno = 0; rqno < rqg->count; rqno++) {
	    rqe = &rqg->rqe[rqno];
	    if (rqe->flags & XFR_BAD_SUBDISK)		    /* this subdisk is bad, */
		rqg->active--;				    /* one less active request */
	}
	if (rqg->active)				    /* we have at least one active request, */
	    rq->active++;				    /* one more active request group */
    }

    /* Now fire off the requests */
    for (rqg = rq->rqg; rqg != NULL;) {			    /* through the whole request chain */
	if (rqg->lockbase >= 0)				    /* this rqg needs a lock first */
	    rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]);
	rcount = rqg->count;
	for (rqno = 0; rqno < rcount;) {
	    rqe = &rqg->rqe[rqno];

	    /*
	     * Point to next rqg before the bottom end
	     * changes the structures.
	     */
	    if (++rqno >= rcount)
		rqg = rqg->next;
	    if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {	    /* this subdisk is good, */
		drive = &DRIVE[rqe->driveno];		    /* look at drive */
		drive->active++;
		if (drive->active >= drive->maxactive)
		    drive->maxactive = drive->active;
		vinum_conf.active++;
		if (vinum_conf.active >= vinum_conf.maxactive)
		    vinum_conf.maxactive = vinum_conf.active;

#if VINUMDEBUG
		if (debug & DEBUG_ADDRESSES)
		    log(LOG_DEBUG,
			"  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
			rqe->b.b_flags & B_READ ? "Read" : "Write",
			major(rqe->b.b_dev),
			minor(rqe->b.b_dev),
			rqe->sdno,
			(u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
			rqe->b.b_blkno,
			rqe->b.b_bcount);
		if (debug & DEBUG_LASTREQS)
		    logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
#endif
		/* fire off the request */
		BUF_STRATEGY(&rqe->b, 0);
	    }
	}
    }
    splx(s);
    return 0;
}
/*
 * Start a transfer.  Return -1 on error,
 * 0 if OK, 1 if we need to retry.
 * Parameter reviveok is set when doing
 * transfers for revives: it allows transfers to
 * be started immediately when a revive is in
 * progress.  During revive, normal transfers
 * are queued if they share address space with
 * a currently active revive operation.
 */
int
vinumstart(struct buf *bp, int reviveok)
{
    int plexno;
    int maxplex;					    /* maximum number of plexes to handle */
    struct volume *vol;
    struct request *rq;					    /* build up our request here */
    enum requeststatus status;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_user_bp, (union rqinfou) bp, bp);
#endif

    if ((bp->b_bcount % DEV_BSIZE) != 0) {		    /* bad length */
	bp->b_error = EINVAL;				    /* invalid size */
	bp->b_flags |= B_ERROR;
	biodone(bp);
	return -1;
    }
    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
    if (rq == NULL) {					    /* can't do it */
	bp->b_error = ENOMEM;				    /* can't get memory */
	bp->b_flags |= B_ERROR;
	biodone(bp);
	return -1;
    }
    bzero(rq, sizeof(struct request));

    /*
     * Note the volume ID.  This can be NULL, which
     * the request building functions use as an
     * indication for single plex I/O
     */
    rq->bp = bp;					    /* and the user buffer struct */

    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {	    /* it's a volume, */
	rq->volplex.volno = Volno(bp->b_dev);		    /* get the volume number */
	vol = &VOL[rq->volplex.volno];			    /* and point to it */
	vol->active++;					    /* one more active request */
	maxplex = vol->plexes;				    /* consider all its plexes */
    } else {
	vol = NULL;					    /* no volume */
	rq->volplex.plexno = Plexno(bp->b_dev);		    /* point to the plex */
	rq->isplex = 1;					    /* note that it's a plex */
	maxplex = 1;					    /* just the one plex */
    }

    if (bp->b_flags & B_READ) {
	/*
	 * This is a read request.  Decide
	 * which plex to read from.
	 *
	 * There's a potential race condition here,
	 * since we're not locked, and we could end
	 * up multiply incrementing the round-robin
	 * counter.  This doesn't have any serious
	 * effects, however.
	 */
	if (vol != NULL) {
	    vol->reads++;
	    plexno = vol->preferred_plex;		    /* get the plex to use */
	    if (plexno < 0) {				    /* round robin */
		plexno = vol->last_plex_read;
		vol->last_plex_read++;
		if (vol->last_plex_read >= vol->plexes)	    /* got to the end? */
		    vol->last_plex_read = 0;		    /* wrap around */
	    }
	    status = build_read_request(rq, plexno);	    /* build a request */
	} else {
	    daddr_t diskaddr = bp->b_blkno;		    /* start offset of transfer */

	    status = bre(rq,				    /* build a request list */
		rq->volplex.plexno,
		&diskaddr,
		diskaddr + (bp->b_bcount / DEV_BSIZE));
	}

	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_flags |= B_ERROR;
	    }
	    if ((bp->b_flags & B_DONE) == 0)
		biodone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    } else {
	/*
	 * This is a write operation.  We write to all plexes.  If this is
	 * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
	 */
	if (vol != NULL) {
	    vol->writes++;
	    status = build_write_request(rq);		    /* Not all the subdisks are up */
	} else {					    /* plex I/O */
	    daddr_t diskstart;

	    diskstart = bp->b_blkno;			    /* start offset of transfer */
	    status = bre(rq,
		Plexno(bp->b_dev),
		&diskstart,
		bp->b_blkno + (bp->b_bcount / DEV_BSIZE));  /* build requests for the plex */
	}

	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_flags |= B_ERROR;
	    }
	    if ((bp->b_flags & B_DONE) == 0)
		biodone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    }
}
/* Lock a stripe of a plex, wait if it's in use */
struct rangelock *
lockrange(daddr_t stripe, struct buf *bp, struct plex *plex)
{
    int s;
    struct rangelock *lock;
    struct rangelock *pos;				    /* position of first free lock */
    int foundlocks;					    /* number of locks found */
    int newlock;

    /*
     * We could get by without counting the number
     * of locks we find, but we have a linear search
     * through a table which in most cases will be
     * empty.  It's faster to stop when we've found
     * all the locks that are there.  This is also
     * the reason why we put pos at the beginning
     * instead of the end, though it requires an
     * extra test.
     */
    pos = NULL;
    foundlocks = 0;

    /*
     * we can't use 0 as a valid address, so
     * increment all addresses by 1.
     */
    stripe++;

    /*
     * We give the locks back from an interrupt
     * context, so we need to raise the spl here.
     */
    s = splbio();

    /* Search the lock table for our stripe */
    for (lock = plex->lock;
	lock < &plex->lock[plex->alloclocks]
	&& foundlocks < plex->usedlocks;
	lock++) {
	if (lock->stripe) {				    /* in use */
	    foundlocks++;				    /* found another one in use */
	    if ((lock->stripe == stripe)		    /* it's our stripe */
	    &&(lock->plexno == plex->plexno)		    /* and our plex */
	    &&(lock->bp != bp)) {			    /* but not our request */
		/*
		 * It would be nice to sleep on the lock
		 * itself, but it could get moved if the
		 * table expands during the wait.  Wait on
		 * the locked address + 1 (since waiting on
		 * 0 isn't allowed) instead.  It isn't
		 * exactly unique, but we won't have many
		 * conflicts.  The worst effect of a
		 * conflict would be an additional
		 * schedule and time through this loop.
		 */
#ifdef VINUMDEBUG
		if (debug & DEBUG_LASTREQS) {
		    struct rangelock info;

		    info.stripe = stripe;
		    info.bp = bp;
		    info.plexno = plex->plexno;
		    logrq(loginfo_lockwait, (union rqinfou) &info, bp);
		}
#endif
		plex->lockwaits++;			    /* waited one more time */
		while (lock->stripe)			    /* wait for it to become free */
		    tsleep((void *) lock->stripe, PRIBIO, "vrlock", 2 * hz);
		break;					    /* out of the inner level loop */
	    }
	} else {
	    if (pos == NULL)				    /* still looking for somewhere? */
		pos = lock;				    /* a place to put this one */
	}
    }

    /*
     * The address range is free.  Add our lock
     * entry.
     */
    if (pos == NULL) {					    /* didn't find an entry */
	if (foundlocks >= plex->alloclocks) {		    /* searched the lot, */
	    newlock = plex->alloclocks;
	    EXPAND(plex->lock, struct rangelock, plex->alloclocks, INITIAL_LOCKS);
	    pos = &plex->lock[newlock];
	    while (newlock < plex->alloclocks)
		plex->lock[newlock++].stripe = 0;
	} else
	    pos = lock;					    /* use the entry we stopped at */
    }
    /* The entry is free: make it ours */
    pos->stripe = stripe;
    pos->bp = bp;
    pos->plexno = plex->plexno;
    plex->usedlocks++;					    /* one more lock */
    splx(s);
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS) {
	struct rangelock info;

	info.stripe = stripe;
	info.bp = bp;
	info.plexno = plex->plexno;
	logrq(loginfo_lock, (union rqinfou) &info, bp);
    }
#endif
    return pos;
}