/*
 * Abort a request: free resources and complete the
 * user request with the specified error
 */
int
abortrequest(struct request *rq, int error)
{
    struct buf *bp = rq->bp;                                /* user buffer */

    bp->b_flags |= B_ERROR;
    bp->b_error = error;
    freerq(rq);                                             /* free everything we're doing */
    biodone(bp);
    return error;                                           /* and give up */
}
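/*
 * abortrequest() follows the standard BSD buffer completion convention:
 * flag the user buffer with B_ERROR, record the code in b_error, and call
 * biodone() to wake the waiter.  A hypothetical call site in a request
 * builder that runs out of memory might read as follows (illustrative
 * only, not a call site shown in this listing):
 *
 *      if (rqe->b.b_data == NULL)              // couldn't get a sub-buffer
 *          return abortrequest(rq, ENOMEM);    // flag, free and complete
 */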
/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
void
complete_rqe(struct bio *bio)
{
    union daemoninfo di;
    struct buf *bp = bio->bio_buf;
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct bio *ubio;                                       /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;                                          /* for error messages */

    get_mplock();
    rqe = (struct rqelement *) bp;                          /* point to the element that completed */
    rqg = rqe->rqg;                                         /* and the request group */
    rq = rqg->rq;                                           /* and the complete request */
    ubio = rq->bio;                                         /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;                                        /* one less outstanding I/O on this drive */
    vinum_conf.active--;                                    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
        wakeup(&launch_requests);                           /* let another one at it */
    if ((bp->b_flags & B_ERROR) != 0) {                     /* transfer in error */
        gravity = "";
        sd = &SD[rqe->sdno];

        if (bp->b_error != 0)                               /* did it return a number? */
            rq->error = bp->b_error;                        /* yes, put it in. */
        else if (rq->error == 0)                            /* no: do we have one already? */
            rq->error = EIO;                                /* no: catchall "I/O error" */
        sd->lasterror = rq->error;
        if (bp->b_cmd == BUF_CMD_READ) {
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = " fatal";
                set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
            }
            log(LOG_ERR,
                "%s:%s read error, offset %lld for %d bytes\n",
                gravity,
                sd->name,
                (long long)bio->bio_offset,
                bp->b_bcount);
        } else {                                            /* write operation */
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = "fatal ";
                set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
            }
            log(LOG_ERR,
                "%s:%s write error, offset %lld for %d bytes\n",
                gravity,
                sd->name,
                (long long)bio->bio_offset,
                bp->b_bcount);
        }
        log(LOG_ERR,
            "%s: user buffer offset %lld for %d bytes\n",
            sd->name,
            (long long)ubio->bio_offset,
            ubio->bio_buf->b_bcount);
        if (rq->error == ENXIO) {                           /* the drive's down too */
            log(LOG_ERR,
                "%s: fatal drive I/O error, offset %lld for %d bytes\n",
                DRIVE[rqe->driveno].label.name,
                (long long)bio->bio_offset,
                bp->b_bcount);
            DRIVE[rqe->driveno].lasterror = rq->error;
            set_drive_state(rqe->driveno,                   /* take the drive down */
                drive_down,
                setstate_force);
        }
    }
    /* Now update the statistics */
    if (bp->b_cmd == BUF_CMD_READ) {                        /* read operation */
        DRIVE[rqe->driveno].reads++;
        DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
        SD[rqe->sdno].reads++;
        SD[rqe->sdno].bytes_read += bp->b_bcount;
        PLEX[rqe->rqg->plexno].reads++;
        PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].reads++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
        }
    } else {                                                /* write operation */
        DRIVE[rqe->driveno].writes++;
        DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
        SD[rqe->sdno].writes++;
        SD[rqe->sdno].bytes_written += bp->b_bcount;
        PLEX[rqe->rqg->plexno].writes++;
        PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].writes++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
        }
    }
    if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
        int *sdata;                                         /* source */
        int *data;                                          /* and group data */
        int length;                                         /* and count involved */
        int count;                                          /* loop counter */
        struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

        /* XOR destination is the user data */
        sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
        data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
        length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
        for (count = 0; count < length; count++)
            data[count] ^= sdata[count];

        /*
         * In a normal read, we will normally read directly
         * into the user buffer.  This doesn't work if
         * we're also doing a recovery, so we have to
         * copy it
         */
        if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
            char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
            char *dst;

            dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
            length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
            bcopy(src, dst, length);                        /* move it */
        }
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */
    &&(rqg->active == 1))                                   /* and this is the last active request */
        complete_raid5_write(rqe);

    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;                                          /* this request now finished */
    if (rqg->active == 0) {                                 /* request group finished, */
        rq->active--;                                       /* one less */
        if (rqg->lock) {                                    /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
            rqg->lock = 0;
        }
    }
    if (rq->active == 0) {                                  /* request finished, */
#ifdef VINUMDEBUG
        if (debug & DEBUG_RESID) {
            if (ubio->bio_buf->b_resid != 0)                /* still something to transfer? */
                Debugger("resid");
        }
#endif
        if (rq->error) {                                    /* did we have an error? */
            if (rq->isplex) {                               /* plex operation, */
                ubio->bio_buf->b_flags |= B_ERROR;          /* yes, propagate to user */
                ubio->bio_buf->b_error = rq->error;
            } else {                                        /* try to recover */
                di.rq = rq;
                queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
            }
        } else {
            ubio->bio_buf->b_resid = 0;                     /* completed our transfer */
            if (rq->isplex == 0)                            /* volume request, */
                VOL[rq->volplex.volno].active--;            /* another request finished */
            biodone(ubio);                                  /* top level buffer completed */
            freerq(rq);                                     /* return the request storage */
        }
    }
    rel_mplock();
}
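/*
 * The XFR_RECOVERY_READ branch above reconstructs the contents of the bad
 * subdisk by XORing each surviving subdisk's data into the destination
 * buffer as its I/O completes; once the last subrequest has completed, the
 * destination holds the missing block, since the parity block XORed with
 * all surviving data blocks equals the lost block.  The following is a
 * minimal user-space sketch of that reconstruction (illustrative only; the
 * helper names and buffer layout are assumptions, not part of this driver):
 *
 *      #include <stddef.h>
 *
 *      // XOR one surviving block into the block being rebuilt, as
 *      // complete_rqe does once per completing subrequest of the group.
 *      static void
 *      xor_into(int *dst, const int *src, size_t nints)
 *      {
 *          size_t i;
 *
 *          for (i = 0; i < nints; i++)
 *              dst[i] ^= src[i];
 *      }
 *
 *      // Rebuild a lost block from the parity block and the other data
 *      // blocks of the stripe.
 *      static void
 *      rebuild_block(int *dst, int *const surviving[], size_t nblocks, size_t nints)
 *      {
 *          size_t i, b;
 *
 *          for (i = 0; i < nints; i++)
 *              dst[i] = 0;                     // start from zero
 *          for (b = 0; b < nblocks; b++)
 *              xor_into(dst, surviving[b], nints);
 *      }
 */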
/*
 * Start a transfer.  Return -1 on error,
 * 0 if OK, 1 if we need to retry.
 * Parameter reviveok is set when doing
 * transfers for revives: it allows transfers to
 * be started immediately when a revive is in
 * progress.  During revive, normal transfers
 * are queued if they share address space with
 * a currently active revive operation.
 */
int
vinumstart(struct buf *bp, int reviveok)
{
    int plexno;
    int maxplex;                                            /* maximum number of plexes to handle */
    struct volume *vol;
    struct request *rq;                                     /* build up our request here */
    enum requeststatus status;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_user_bp, (union rqinfou) bp, bp);
#endif
    if ((bp->b_bcount % DEV_BSIZE) != 0) {                  /* bad length */
        bp->b_error = EINVAL;                               /* invalid size */
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return -1;
    }
    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
    if (rq == NULL) {                                       /* can't do it */
        bp->b_error = ENOMEM;                               /* can't get memory */
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return -1;
    }
    bzero(rq, sizeof(struct request));

    /*
     * Note the volume ID.  This can be NULL, which
     * the request building functions use as an
     * indication for single plex I/O
     */
    rq->bp = bp;                                            /* and the user buffer struct */
    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {          /* it's a volume, */
        rq->volplex.volno = Volno(bp->b_dev);               /* get the volume number */
        vol = &VOL[rq->volplex.volno];                      /* and point to it */
        vol->active++;                                      /* one more active request */
        maxplex = vol->plexes;                              /* consider all its plexes */
    } else {
        vol = NULL;                                         /* no volume */
        rq->volplex.plexno = Plexno(bp->b_dev);             /* point to the plex */
        rq->isplex = 1;                                     /* note that it's a plex */
        maxplex = 1;                                        /* just the one plex */
    }

    if (bp->b_flags & B_READ) {
        /*
         * This is a read request.  Decide
         * which plex to read from.
         *
         * There's a potential race condition here,
         * since we're not locked, and we could end
         * up multiply incrementing the round-robin
         * counter.  This doesn't have any serious
         * effects, however.
         */
        if (vol != NULL) {
            vol->reads++;
            plexno = vol->preferred_plex;                   /* get the plex to use */
            if (plexno < 0) {                               /* round robin */
                plexno = vol->last_plex_read;
                vol->last_plex_read++;
                if (vol->last_plex_read >= vol->plexes)     /* got to the end? */
                    vol->last_plex_read = 0;                /* wrap around */
            }
            status = build_read_request(rq, plexno);        /* build a request */
        } else {
            daddr_t diskaddr = bp->b_blkno;                 /* start offset of transfer */

            status = bre(rq,                                /* build a request list */
                rq->volplex.plexno,
                &diskaddr,
                diskaddr + (bp->b_bcount / DEV_BSIZE));
        }

        if ((status > REQUEST_RECOVERED)                    /* can't satisfy it */
        ||(bp->b_flags & B_DONE)) {                         /* XXX shouldn't get this without bad status */
            if (status == REQUEST_DOWN) {                   /* not enough subdisks */
                bp->b_error = EIO;                          /* I/O error */
                bp->b_flags |= B_ERROR;
            }
            biodone(bp);
            freerq(rq);
            return -1;
        }
        return launch_requests(rq, reviveok);               /* now start the requests if we can */
    } else {
        /*
         * This is a write operation.  We write to all plexes.  If this is
         * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
         */
        if (vol != NULL) {
            vol->writes++;
            status = build_write_request(rq);               /* Not all the subdisks are up */
        } else {                                            /* plex I/O */
            daddr_t diskstart;

            diskstart = bp->b_blkno;                        /* start offset of transfer */
            status = bre(rq,
                Plexno(bp->b_dev),
                &diskstart,
                bp->b_blkno + (bp->b_bcount / DEV_BSIZE));  /* build requests for the plex */
        }
        if ((status > REQUEST_RECOVERED)                    /* can't satisfy it */
        ||(bp->b_flags & B_DONE)) {                         /* XXX shouldn't get this without bad status */
            if (status == REQUEST_DOWN) {                   /* not enough subdisks */
                bp->b_error = EIO;                          /* I/O error */
                bp->b_flags |= B_ERROR;
            }
            if ((bp->b_flags & B_DONE) == 0)
                biodone(bp);
            freerq(rq);
            return -1;
        }
        return launch_requests(rq, reviveok);               /* now start the requests if we can */
    }
}
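/*
 * For volume reads, vinumstart() picks one plex: the configured preferred
 * plex if there is one, otherwise a simple round robin over the volume's
 * plexes driven by the per-volume last_plex_read counter.  As the comment
 * in the read path notes, the counter is bumped without a lock; a race can
 * at worst skew the load balancing, never correctness.  A condensed sketch
 * of that selection (the struct and function names here are illustrative,
 * not taken from this driver):
 *
 *      struct rr_vol {
 *          int preferred_plex;                 // < 0 means round robin
 *          int last_plex_read;                 // next plex to read from
 *          int plexes;                         // number of plexes in the volume
 *      };
 *
 *      static int
 *      choose_read_plex(struct rr_vol *vol)
 *      {
 *          int plexno;
 *
 *          if (vol->preferred_plex >= 0)       // administrator chose one
 *              return vol->preferred_plex;
 *          plexno = vol->last_plex_read;       // round robin: take the next one
 *          vol->last_plex_read++;
 *          if (vol->last_plex_read >= vol->plexes)
 *              vol->last_plex_read = 0;        // wrap around
 *          return plexno;
 *      }
 */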