/*
 * Recover a failed I/O operation.
 *
 * The correct way to do this is to examine the request and determine
 * how to recover each individual failure.  In the case of a write,
 * this could be as simple as doing nothing: the defective drives may
 * already be down, and there may be nothing else to do.  In case of
 * a read, it will be necessary to retry if there are alternative
 * copies of the data.
 *
 * The easy way (here) is just to reissue the whole original request.
 * This will take a little longer, but nothing like as long as the
 * failure will have taken.
 *
 * rq: the failed request; only its original caller buffer (rq->bp)
 *     is used here.
 */
void
recover_io(struct request *rq)
{
	/*
	 * This should read:
	 *
	 * vinumstrategy(rq->bp);
	 *
	 * Negotiate with phk to get it fixed.
	 */
	BUF_STRATEGY(rq->bp, 0);			    /* reissue the command */
}
/*
 * Perform "physical" (raw) I/O: move data directly between the caller's
 * buffers and the device, bypassing the buffer cache.
 *
 * Each iovec in the uio is carved into chunks no larger than the
 * driver's si_iosize_max and the pbuf's kva window, and each chunk is
 * issued with BUF_STRATEGY() and waited for synchronously.  The uio is
 * advanced by the amount actually transferred after every chunk.
 *
 * dev:    device to transfer to/from.
 * uio:    source/destination buffers, direction and device offset.
 * ioflag: unused in this function.
 *
 * Returns 0 on success or EOF (short transfer without B_ERROR),
 * EINVAL if the device offset yields a block number that overflows
 * daddr_t, EFAULT if a user buffer is inaccessible, or the driver's
 * b_error on I/O failure.
 */
int
physio(dev_t dev, struct uio *uio, int ioflag)
{
	int i;
	int error;
	int spl;
	caddr_t sa;		/* pbuf's own kva; stashed in b_saveaddr for relpbuf */
	off_t blockno;
	u_int iolen;
	struct buf *bp;

	/* Keep the process UPAGES from being swapped.  XXX: why ? */
	PHOLD(curproc);

	bp = getpbuf(NULL);
	sa = bp->b_data;
	error = bp->b_error = 0;

	/* XXX: sanity check */
	if(dev->si_iosize_max < PAGE_SIZE) {
		printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n",
		    devtoname(dev), dev->si_iosize_max);
		dev->si_iosize_max = DFLTPHYS;
	}

	for (i = 0; i < uio->uio_iovcnt; i++) {
		/* Keep issuing chunks until this iovec is exhausted. */
		while (uio->uio_iov[i].iov_len) {
			if (uio->uio_rw == UIO_READ)
				bp->b_flags = B_PHYS | B_CALL | B_READ;
			else
				bp->b_flags = B_PHYS | B_CALL | B_WRITE;
			bp->b_dev = dev;
			bp->b_iodone = physwakeup;	/* wakes the tsleep below */
			bp->b_data = uio->uio_iov[i].iov_base;
			bp->b_bcount = uio->uio_iov[i].iov_len;
			bp->b_offset = uio->uio_offset;
			bp->b_saveaddr = sa;

			/* Don't exceed drivers iosize limit */
			if (bp->b_bcount > dev->si_iosize_max)
				bp->b_bcount = dev->si_iosize_max;

			/*
			 * Make sure the pbuf can map the request
			 * XXX: The pbuf has kvasize = MAXPHYS so a request
			 * XXX: larger than MAXPHYS - PAGE_SIZE must be
			 * XXX: page aligned or it will be fragmented.
			 */
			iolen = ((vm_offset_t) bp->b_data) & PAGE_MASK;
			if ((bp->b_bcount + iolen) > bp->b_kvasize) {
				bp->b_bcount = bp->b_kvasize;
				if (iolen != 0)
					bp->b_bcount -= PAGE_SIZE;
			}
			bp->b_bufsize = bp->b_bcount;

			/*
			 * Reject offsets whose sector number will not fit
			 * in b_blkno (daddr_t).
			 */
			blockno = bp->b_offset >> DEV_BSHIFT;
			if ((daddr_t)blockno != blockno) {
				error = EINVAL; /* blockno overflow */
				goto doerror;
			}
			bp->b_blkno = blockno;

			if (uio->uio_segflg == UIO_USERSPACE) {
				/*
				 * A device read writes into the user buffer
				 * (needs VM_PROT_WRITE) and vice versa.
				 */
				if (!useracc(bp->b_data, bp->b_bufsize,
				    bp->b_flags & B_READ ?
				    VM_PROT_WRITE : VM_PROT_READ)) {
					error = EFAULT;
					goto doerror;
				}
				vmapbuf(bp);	/* wire user pages into pbuf kva */
			}

			/* Issue the chunk and sleep until the driver marks it done. */
			BUF_STRATEGY(bp, 0);
			spl = splbio();
			while ((bp->b_flags & B_DONE) == 0)
				tsleep((caddr_t)bp, PRIBIO, "physstr", 0);
			splx(spl);

			if (uio->uio_segflg == UIO_USERSPACE)
				vunmapbuf(bp);
			iolen = bp->b_bcount - bp->b_resid;	/* bytes actually moved */
			if (iolen == 0 && !(bp->b_flags & B_ERROR))
				goto doerror;	/* EOF */
			uio->uio_iov[i].iov_len -= iolen;
			uio->uio_iov[i].iov_base += iolen;
			uio->uio_resid -= iolen;
			uio->uio_offset += iolen;
			if( bp->b_flags & B_ERROR) {
				error = bp->b_error;
				goto doerror;
			}
		}
	}
doerror:
	relpbuf(bp, NULL);
	PRELE(curproc);
	return (error);
}
/*
 * Call the low-level strategy routines to
 * perform the requests in a struct request.
 *
 * rq:       the request chain (one or more request groups) to issue.
 * reviveok: non-zero if a revive-conflicting request may be issued now;
 *           zero makes a conflicting request wait on the subdisk's
 *           waitlist instead of being launched.
 *
 * Always returns 0, including when the request was merely queued on
 * a waitlist rather than issued.
 */
int
launch_requests(struct request *rq, int reviveok)
{
	int s;						/* saved spl */
	struct rqgroup *rqg;
	int rqno;					/* loop index */
	struct rqelement *rqe;				/* current element */
	struct drive *drive;
	int rcount;					/* request count */

	/*
	 * First find out whether we're reviving, and the
	 * request contains a conflict.  If so, we hang
	 * the request off plex->waitlist of the first
	 * plex we find which is reviving
	 */
	if ((rq->flags & XFR_REVIVECONFLICT)		/* possible revive conflict */
	    &&(!reviveok)) {				/* and we don't want to do it now, */
		struct sd *sd;
		struct request *waitlist;		/* point to the waitlist */

		sd = &SD[rq->sdno];
		if (sd->waitlist != NULL) {		/* something there already, */
			waitlist = sd->waitlist;
			while (waitlist->next != NULL)	/* find the end */
				waitlist = waitlist->next;
			waitlist->next = rq;		/* hook our request there */
		} else
			sd->waitlist = rq;		/* hook our request at the front */
#if VINUMDEBUG
		if (debug & DEBUG_REVIVECONFLICT)
			log(LOG_DEBUG,
			    "Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
			    rq->sdno,
			    rq,
			    rq->bp->b_flags & B_READ ? "Read" : "Write",
			    major(rq->bp->b_dev),
			    minor(rq->bp->b_dev),
			    rq->bp->b_blkno,
			    rq->bp->b_bcount);
#endif
		return 0;				/* and get out of here */
	}
	rq->active = 0;					/* nothing yet */
#if VINUMDEBUG
	if (debug & DEBUG_ADDRESSES)
		log(LOG_DEBUG,
		    "Request: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
		    rq,
		    rq->bp->b_flags & B_READ ? "Read" : "Write",
		    major(rq->bp->b_dev),
		    minor(rq->bp->b_dev),
		    rq->bp->b_blkno,
		    rq->bp->b_bcount);
	vinum_conf.lastrq = rq;
	vinum_conf.lastbuf = rq->bp;
	if (debug & DEBUG_LASTREQS)
		logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
#endif

	/*
	 * With the division of labour below (first count the requests, then
	 * issue them), it's possible that we don't need this splbio()
	 * protection.  But I'll try that some other time.
	 */
	s = splbio();

	/*
	 * Pass 1: count active elements and groups before issuing anything,
	 * so completion interrupts see consistent counters.
	 */
	for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) { /* through the whole request chain */
		rqg->active = rqg->count;		/* they're all active */
		for (rqno = 0; rqno < rqg->count; rqno++) {
			rqe = &rqg->rqe[rqno];
			if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */
				rqg->active--;		/* one less active request */
		}
		if (rqg->active)			/* we have at least one active request, */
			rq->active++;			/* one more active request group */
	}

	/* Now fire off the requests */
	for (rqg = rq->rqg; rqg != NULL;) {		/* through the whole request chain */
		if (rqg->lockbase >= 0)			/* this rqg needs a lock first */
			rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]);
		rcount = rqg->count;
		for (rqno = 0; rqno < rcount;) {
			rqe = &rqg->rqe[rqno];

			/*
			 * Point to next rqg before the bottom end
			 * changes the structures.
			 */
			if (++rqno >= rcount)
				rqg = rqg->next;
			if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* this subdisk is good, */
				drive = &DRIVE[rqe->driveno]; /* look at drive */
				drive->active++;
				if (drive->active >= drive->maxactive)	/* track high-water marks */
					drive->maxactive = drive->active;
				vinum_conf.active++;
				if (vinum_conf.active >= vinum_conf.maxactive)
					vinum_conf.maxactive = vinum_conf.active;
#if VINUMDEBUG
				if (debug & DEBUG_ADDRESSES)
					log(LOG_DEBUG,
					    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
					    rqe->b.b_flags & B_READ ? "Read" : "Write",
					    major(rqe->b.b_dev),
					    minor(rqe->b.b_dev),
					    rqe->sdno,
					    (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
					    rqe->b.b_blkno,
					    rqe->b.b_bcount);
				if (debug & DEBUG_LASTREQS)
					logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
#endif
				/* fire off the request */
				BUF_STRATEGY(&rqe->b, 0);
			}
		}
	}
	splx(s);
	return 0;
}
/*
 * Perform I/O on a subdisk.
 *
 * Builds a clone buffer (struct sdbuf) describing the same transfer
 * relocated to the underlying drive (b_blkno biased by the subdisk's
 * driveoffset) and issues it to the drive's device.  Completion is
 * delivered to sdio_done via B_CALL.  Requests to a down drive or an
 * unusable subdisk are failed immediately with EIO; transfers running
 * past the end of the subdisk are trimmed.
 *
 * bp: the caller's buffer header addressed to the subdisk device.
 */
void
sdio(struct buf *bp)
{
	int s;						/* spl */
	struct sd *sd;
	struct sdbuf *sbp;
	daddr_t endoffset;
	struct drive *drive;

#if VINUMDEBUG
	if (debug & DEBUG_LASTREQS)
		logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
	sd = &SD[Sdno(bp->b_dev)];			/* point to the subdisk */
	drive = &DRIVE[sd->driveno];
	if (drive->state != drive_up) {
		/*
		 * Drive is down: a failed read leaves the subdisk crashed
		 * (data merely unreachable), a failed write makes it stale
		 * (data now out of date).
		 */
		if (sd->state >= sd_crashed) {
			if (bp->b_flags & B_READ)	/* reading, */
				set_sd_state(sd->sdno, sd_crashed, setstate_force);
			else
				set_sd_state(sd->sdno, sd_stale, setstate_force);
		}
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}
	/*
	 * We allow access to any kind of subdisk as long as we can expect
	 * to get the I/O performed.
	 */
	if (sd->state < sd_empty) {			/* nothing to talk to, */
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}
	/* Get a buffer */
	sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
	if (sbp == NULL) {
		bp->b_flags |= B_ERROR;
		bp->b_error = ENOMEM;
		biodone(bp);
		return;
	}
	bzero(sbp, sizeof(struct sdbuf));		/* start with nothing */
	sbp->b.b_flags = bp->b_flags | B_CALL;		/* inform us when it's done */
	sbp->b.b_bufsize = bp->b_bufsize;		/* buffer size */
	sbp->b.b_bcount = bp->b_bcount;			/* number of bytes to transfer */
	sbp->b.b_resid = bp->b_resid;			/* and amount waiting */
	sbp->b.b_dev = DRIVE[sd->driveno].dev;		/* device */
	sbp->b.b_data = bp->b_data;			/* data buffer */
	sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;	/* relocate onto the drive */
	sbp->b.b_iodone = sdio_done;			/* come here on completion */
	BUF_LOCKINIT(&sbp->b);				/* get a lock for the buffer */
	BUF_LOCK(&sbp->b, LK_EXCLUSIVE);		/* and lock it */
	sbp->bp = bp;					/* note the address of the original header */
	sbp->sdno = sd->sdno;				/* note for statistics */
	sbp->driveno = sd->driveno;
	endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;	/* final sector offset */
	if (endoffset > sd->sectors) {			/* beyond the end */
		sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
		if (sbp->b.b_bcount <= 0) {		/* nothing to transfer */
			bp->b_resid = bp->b_bcount;	/* nothing transferred */
			biodone(bp);
			Free(sbp);
			return;
		}
	}
#if VINUMDEBUG
	if (debug & DEBUG_ADDRESSES)
		log(LOG_DEBUG,
		    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
		    sbp->b.b_flags & B_READ ? "Read" : "Write",
		    major(sbp->b.b_dev),
		    minor(sbp->b.b_dev),
		    sbp->sdno,
		    (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
		    (int) sbp->b.b_blkno,
		    sbp->b.b_bcount);
#endif
	s = splbio();
#if VINUMDEBUG
	if (debug & DEBUG_LASTREQS)
		logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
	BUF_STRATEGY(&sbp->b, 0);
	splx(s);
}
/*
 * Write disk label back to device after modification.
 *
 * Reads the label sector first, locates the existing 32-bit label
 * inside it (scanning on sizeof(long) boundaries), overwrites it in
 * place with *lpx.lab32, fixes it up via l32_fixlabel(), and writes
 * the sector back synchronously.
 *
 * dev: whole-slice device to write through (asserted below).
 * ssp: slice table for the disk (unused in this path).
 * sp:  the slice whose label is being written.
 * lpx: the new label; lpx.lab32 is repointed at the in-buffer copy
 *      before fixup.
 *
 * Returns 0 on success, EXDEV if the raw partition does not start at
 * offset 0, EINVAL if fixup rejects the label, ESRCH if no existing
 * valid label was found in the sector, or an I/O error from biowait().
 */
static int
l32_writedisklabel(cdev_t dev, struct diskslices *ssp, struct diskslice *sp,
		   disklabel_t lpx)
{
	struct disklabel32 *lp;
	struct disklabel32 *dlp;
	struct buf *bp;
	const char *msg;
	int error = 0;

	lp = lpx.lab32;
	if (lp->d_partitions[RAW_PART].p_offset != 0)
		return (EXDEV);				/* not quite right */

	/* Set up a one-sector synchronous transfer on the label sector. */
	bp = geteblk((int)lp->d_secsize);
	bp->b_bio1.bio_offset = (off_t)LABELSECTOR32 * lp->d_secsize;
	bp->b_bio1.bio_done = biodone_sync;
	bp->b_bio1.bio_flags |= BIO_SYNC;
	bp->b_bcount = lp->d_secsize;

#if 1
	/*
	 * We read the label first to see if it's there,
	 * in which case we will put ours at the same offset into the block..
	 * (I think this is stupid [Julian])
	 * Note that you can't write a label out over a corrupted label!
	 * (also stupid.. how do you write the first one? by raw writes?)
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_cmd = BUF_CMD_READ;
	KKASSERT(dkpart(dev) == WHOLE_SLICE_PART);
	dev_dstrategy(dev, &bp->b_bio1);
	error = biowait(&bp->b_bio1, "labrd");
	if (error)
		goto done;

	/*
	 * Scan the sector for a valid existing label (correct magics and
	 * zero checksum), stepping sizeof(long) bytes at a time.
	 */
	for (dlp = (struct disklabel32 *)bp->b_data;
	    dlp <= (struct disklabel32 *)
	    ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp));
	    dlp = (struct disklabel32 *)((char *)dlp + sizeof(long))) {
		if (dlp->d_magic == DISKMAGIC32 &&
		    dlp->d_magic2 == DISKMAGIC32 &&
		    dkcksum32(dlp) == 0) {
			*dlp = *lp;		/* install new label at same offset */
			lpx.lab32 = dlp;
			msg = l32_fixlabel(NULL, sp, lpx, TRUE);
			if (msg) {
				error = EINVAL;
			} else {
				/* bio_done/bio_flags were consumed by the read; rearm. */
				bp->b_cmd = BUF_CMD_WRITE;
				bp->b_bio1.bio_done = biodone_sync;
				bp->b_bio1.bio_flags |= BIO_SYNC;
				KKASSERT(dkpart(dev) == WHOLE_SLICE_PART);
				dev_dstrategy(dev, &bp->b_bio1);
				error = biowait(&bp->b_bio1, "labwr");
			}
			goto done;
		}
	}
	error = ESRCH;			/* no valid label found in the sector */
done:
#else
	/* Alternative (disabled): just write a fresh label at offset 0. */
	bzero(bp->b_data, lp->d_secsize);
	dlp = (struct disklabel32 *)bp->b_data;
	*dlp = *lp;
	bp->b_flags &= ~B_INVAL;
	bp->b_cmd = BUF_CMD_WRITE;
	bp->b_bio1.bio_done = biodone_sync;
	bp->b_bio1.bio_flags |= BIO_SYNC;
	BUF_STRATEGY(bp, 1);
	error = biowait(&bp->b_bio1, "labwr");
#endif
	bp->b_flags |= B_INVAL | B_AGE;	/* don't cache the label sector */
	brelse(bp);
	return (error);
}