/*
 * Move a subdisk to a different drive (ioctl worker).
 *
 * msg->index is the destination drive, msg->otherobject the subdisk to
 * move.  The reply is returned in the same buffer as the request
 * (reply aliases msg); on validation failure validdrive/validsd have
 * already filled in reply->error, so we just return.
 *
 * Only the assignment is changed here: the subdisk's data is not
 * copied, so the subdisk is marked stale (or empty) and its plex state
 * updated.  NOTE(review): the early return when the subdisk already
 * lives on the target drive leaves reply->error untouched — presumably
 * the caller pre-initializes it; verify.
 */
void
moveobject(struct vinum_ioctl_msg *msg)
{
    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;	/* reply overlays the request buffer */
    struct drive *drive;
    struct sd *sd;

    /* Check that our objects are valid (i.e. they exist) */
    drive = validdrive(msg->index, (struct _ioctl_reply *) msg);
    if (drive == NULL)					    /* reply->error already set by validdrive */
	return;
    sd = validsd(msg->otherobject, (struct _ioctl_reply *) msg);
    if (sd == NULL)					    /* reply->error already set by validsd */
	return;
    if (sd->driveno == msg->index)			    /* sd already belongs to drive */
	return;

    /* The data does not follow the subdisk, so invalidate it. */
    if (sd->state > sd_stale)
	set_sd_state(sd->sdno, sd_stale, setstate_force);   /* make the subdisk stale */
    else
	sd->state = sd_empty;
    if (sd->plexno >= 0)				    /* part of a plex, */
	update_plex_state(sd->plexno);			    /* update its state */

    /* Return the space on the old drive */
    if ((sd->driveno >= 0)				    /* we have a drive, */
	&& (sd->sectors > 0))				    /* and some space on it */
	return_drive_space(sd->driveno,			    /* return the space */
	    sd->driveoffset,
	    sd->sectors);

    /* Reassign the old subdisk */
    sd->driveno = msg->index;
    sd->driveoffset = -1;				    /* let the drive decide where to put us */
    give_sd_to_drive(sd->sdno);
    reply->error = 0;
}
/*
 * define the low-level requests needed to perform
 * a high-level I/O operation for a specific plex
 * 'plexno'.
 *
 * Return 0 if all subdisks involved in the
 * request are up, 1 if some subdisks are not up,
 * and -1 if the request is at least partially
 * outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the
 * end address.  On read, return on the first bad
 * subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the prq structures
 * are not assigned.  The assignment is performed
 * by expandrq().  Strictly speaking, the elements
 * rqe->sdno of all entries should be set to -1,
 * since 0 (from bzero) is a valid subdisk number.
 * We avoid this problem by initializing the ones
 * we use, and not looking at the others (index >=
 * prq->requests).
 */
enum requeststatus
bre5(struct request *rq,
    int plexno,
    daddr_t * diskaddr,
    daddr_t diskend)
{
    struct metrics m;					    /* most of the information */
    struct sd *sd;
    struct plex *plex;
    struct buf *bp;					    /* user's bp */
    struct rqgroup *rqg;				    /* the request group that we will create */
    struct rqelement *rqe;				    /* point to this request information */
    int rsectors;					    /* sectors remaining in this stripe */
    int mysdno;						    /* another sd index in loops */
    int rqno;						    /* request number */

    rqg = NULL;						    /* shut up, damn compiler */
    m.diskstart = *diskaddr;				    /* start of transfer */
    bp = rq->bp;					    /* buffer pointer */
    plex = &PLEX[plexno];				    /* point to the plex */

    while (*diskaddr < diskend) {			    /* until we get it all sorted out */
	if (*diskaddr >= plex->length)			    /* beyond the end of the plex */
	    return REQUEST_EOF;				    /* can't continue */

	m.badsdno = -1;					    /* no bad subdisk yet */

	/* Part A: Define the request */
	/*
	 * First, calculate some sizes:
	 * The offset of the start address from
	 * the start of the stripe.
	 */
	m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));

	/*
	 * The plex-relative address of the
	 * start of the stripe.
	 */
	m.stripebase = *diskaddr - m.stripeoffset;

	/* subdisk containing the parity stripe */
	if (plex->organization == plex_raid5)
	    m.psdno = plex->subdisks - 1
		- (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
		% plex->subdisks;
	else						    /* RAID-4: parity is always the last subdisk */
	    m.psdno = plex->subdisks - 1;

	/*
	 * The number of the subdisk in which
	 * the start is located.
	 */
	m.firstsdno = m.stripeoffset / plex->stripesize;
	if (m.firstsdno >= m.psdno)			    /* at or past parity sd */
	    m.firstsdno++;				    /* increment it */

	/*
	 * The offset from the beginning of
	 * the stripe on this subdisk.
	 */
	m.initoffset = m.stripeoffset % plex->stripesize;

	/* The offset of the stripe start relative to this subdisk */
	m.sdbase = m.stripebase / (plex->subdisks - 1);

	m.useroffset = *diskaddr - m.diskstart;		    /* The offset of the start in the user buffer */

	/*
	 * The number of sectors to transfer in the
	 * current (first) subdisk.
	 */
	m.initlen = min(diskend - *diskaddr,		    /* the amount remaining to transfer */
	    plex->stripesize - m.initoffset);		    /* and the amount left in this block */

	/*
	 * The number of sectors to transfer in this stripe
	 * is the minimum of the amount remaining to transfer
	 * and the amount left in this stripe.
	 */
	m.stripesectors = min(diskend - *diskaddr,
	    plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);

	/* The number of data subdisks involved in this request */
	m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;

	/*
	 * Part B: decide what kind of transfer this will be;
	 * start and end addresses of the transfer in
	 * the current block.
	 *
	 * There are a number of different kinds of
	 * transfer, each of which relates to a
	 * specific subdisk:
	 *
	 * 1. Normal read.  All participating subdisks
	 *    are up, and the transfer can be made
	 *    directly to the user buffer.  The bounds
	 *    of the transfer are described by
	 *    m.dataoffset and m.datalen.  We have
	 *    already calculated m.initoffset and
	 *    m.initlen, which define the parameters
	 *    for the first data block.
	 *
	 * 2. Recovery read.  One participating
	 *    subdisk is down.  To recover data, all
	 *    the other subdisks, including the parity
	 *    subdisk, must be read.  The data is
	 *    recovered by exclusive-oring all the
	 *    other blocks.  The bounds of the
	 *    transfer are described by m.groupoffset
	 *    and m.grouplen.
	 *
	 * 3. A read request may request reading both
	 *    available data (normal read) and
	 *    non-available data (recovery read).
	 *    This can be a problem if the address
	 *    ranges of the two reads do not coincide:
	 *    in this case, the normal read needs to
	 *    be extended to cover the address range
	 *    of the recovery read, and must thus be
	 *    performed out of malloced memory.
	 *
	 * 4. Normal write.  All the participating
	 *    subdisks are up.  The bounds of the
	 *    transfer are described by m.dataoffset
	 *    and m.datalen.  Since these values
	 *    differ for each block, we calculate the
	 *    bounds for the parity block
	 *    independently as the maximum of the
	 *    individual blocks and store these values
	 *    in m.writeoffset and m.writelen.  This
	 *    write proceeds in four phases:
	 *
	 *    i.   Read the old contents of each block
	 *         and the parity block.
	 *    ii.  ``Remove'' the old contents from
	 *         the parity block with exclusive or.
	 *    iii. ``Insert'' the new contents of the
	 *         block in the parity block, again
	 *         with exclusive or.
	 *    iv.  Write the new contents of the data
	 *         blocks and the parity block.  The data
	 *         block transfers can be made directly from
	 *         the user buffer.
	 *
	 * 5. Degraded write where the data block is
	 *    not available.  The bounds of the
	 *    transfer are described by m.groupoffset
	 *    and m.grouplen.  This requires the
	 *    following steps:
	 *
	 *    i.   Read in all the other data blocks,
	 *         excluding the parity block.
	 *    ii.  Recreate the parity block from the
	 *         other data blocks and the data to be
	 *         written.
	 *    iii. Write the parity block.
	 *
	 * 6. Parityless write, a write where the
	 *    parity block is not available.  This is
	 *    in fact the simplest: just write the
	 *    data blocks.  This can proceed directly
	 *    from the user buffer.  The bounds of the
	 *    transfer are described by m.dataoffset
	 *    and m.datalen.
	 *
	 * 7. Combination of degraded data block write
	 *    and normal write.  In this case the
	 *    address ranges of the reads may also
	 *    need to be extended to cover all
	 *    participating blocks.
	 *
	 * All requests in a group transfer transfer
	 * the same address range relative to their
	 * subdisk.  The individual transfers may
	 * vary, but since our group of requests is
	 * all in a single slice, we can define a
	 * range in which they all fall.
	 *
	 * In the following code section, we determine
	 * which kind of transfer we will perform.  If
	 * there is a group transfer, we also decide
	 * its bounds relative to the subdisks.  At
	 * the end, we have the following values:
	 *
	 *   m.flags indicates the kinds of transfers
	 *     we will perform.
	 *   m.initoffset indicates the offset of the
	 *     beginning of any data operation relative
	 *     to the beginning of the stripe base.
	 *   m.initlen specifies the length of any data
	 *     operation.
	 *   m.dataoffset contains the same value as
	 *     m.initoffset.
	 *   m.datalen contains the same value as
	 *     m.initlen.  Initially dataoffset and
	 *     datalen describe the parameters for the
	 *     first data block; while building the data
	 *     block requests, they are updated for each
	 *     block.
	 *   m.groupoffset indicates the offset of any
	 *     group operation relative to the beginning
	 *     of the stripe base.
	 *   m.grouplen specifies the length of any
	 *     group operation.
	 *   m.writeoffset indicates the offset of a
	 *     normal write relative to the beginning of
	 *     the stripe base.  This value differs from
	 *     m.dataoffset in that it applies to the
	 *     entire operation, and not just the first
	 *     block.
	 *   m.writelen specifies the total span of a
	 *     normal write operation.  writeoffset and
	 *     writelen are used to define the parity
	 *     block.
	 */
	m.groupoffset = 0;				    /* assume no group... */
	m.grouplen = 0;					    /* until we know we have one */
	m.writeoffset = m.initoffset;			    /* start offset of transfer */
	m.writelen = 0;					    /* nothing to write yet */
	m.flags = 0;					    /* no flags yet */
	rsectors = m.stripesectors;			    /* remaining sectors to examine */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	if (m.sdcount > 1) {
	    plex->multiblock++;				    /* more than one block for the request */
	    /*
	     * If we have two transfers that don't overlap,
	     * (one at the end of the first block, the other
	     * at the beginning of the second block),
	     * it's cheaper to split them.
	     */
	    if (rsectors < plex->stripesize) {
		m.sdcount = 1;				    /* just one subdisk */
		m.stripesectors = m.initlen;		    /* and just this many sectors */
		rsectors = m.initlen;			    /* and in the loop counter */
	    }
	}
	if (SD[plex->sdnos[m.psdno]].state < sd_reborn)	    /* is our parity subdisk down? */
	    m.badsdno = m.psdno;			    /* note that it's down */

	if (bp->b_flags & B_READ) {			    /* read operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		/* skip the parity subdisk, wrapping around the stripe */
		if (mysdno == m.psdno)			    /* ignore parity on read */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
		    if (m.badsdno >= 0)			    /* we had one already, */
			return REQUEST_DOWN;		    /* we can't take a second */
		    m.badsdno = mysdno;			    /* got the first */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		    m.flags |= XFR_RECOVERY_READ;	    /* we need recovery */
		    plex->recovered_reads++;		    /* count another one */
		} else
		    m.flags |= XFR_NORMAL_READ;		    /* normal read */

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	} else {					    /* write operation */
	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
		/* skip the parity subdisk, wrapping around the stripe */
		if (mysdno == m.psdno)			    /* parity stripe, we've dealt with that */
		    mysdno++;
		if (mysdno == plex->subdisks)		    /* wraparound */
		    mysdno = 0;
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		sd = &SD[plex->sdnos[mysdno]];
		if (sd->state != sd_up) {
		    enum requeststatus s;

		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
		    if (s && (m.badsdno >= 0)) {	    /* second bad disk, */
			int sdno;
			/*
			 * If the parity disk is down, there's
			 * no recovery.  We make all involved
			 * subdisks stale.  Otherwise, we
			 * should be able to recover, but it's
			 * like pulling teeth.  Fix it later.
			 */
			for (sdno = 0; sdno < m.sdcount; sdno++) {
			    struct sd *sd = &SD[plex->sdnos[sdno]];
			    if (sd->state >= sd_reborn)	    /* sort of up, */
				set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
			}
			return s;			    /* and crap out */
		    }
		    m.badsdno = mysdno;			    /* note which one is bad */
		    m.flags |= XFR_DEGRADED_WRITE;	    /* we need recovery */
		    plex->degraded_writes++;		    /* count another one */
		    m.groupoffset = m.dataoffset;	    /* define the bounds */
		    m.grouplen = m.datalen;
		} else {
		    m.flags |= XFR_NORMAL_WRITE;	    /* normal write operation */
		    /* extend [writeoffset, writeoffset+writelen) to cover this block */
		    if (m.writeoffset > m.dataoffset) {	    /* move write operation lower */
			m.writelen = max(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.dataoffset;
			m.writeoffset = m.dataoffset;
		    } else
			m.writelen = max(m.writeoffset + m.writelen,
			    m.dataoffset + m.datalen)
			    - m.writeoffset;
		}

		/* Update the pointers for the next block */
		m.dataoffset = 0;			    /* back to the start of the stripe */
		rsectors -= m.datalen;			    /* remaining sectors to examine */
		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
	    }
	    if (m.badsdno == m.psdno) {			    /* got a bad parity block, */
		struct sd *psd = &SD[plex->sdnos[m.psdno]];

		if (psd->state == sd_down)
		    set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
		else if (psd->state == sd_crashed)
		    set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
		m.flags &= ~XFR_NORMAL_WRITE;		    /* this write isn't normal, */
		m.flags |= XFR_PARITYLESS_WRITE;	    /* it's parityless */
		plex->parityless_writes++;		    /* count another one */
	    }
	}

	/* reset the initial transfer values */
	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
	m.datalen = m.initlen;

	/* decide how many requests we need */
	if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
	    /* doing a recovery read or degraded write, */
	    m.rqcount = plex->subdisks;			    /* all subdisks */
	else if (m.flags & XFR_NORMAL_WRITE)		    /* normal write, */
	    m.rqcount = m.sdcount + 1;			    /* all data blocks and the parity block */
	else						    /* parityless write or normal read */
	    m.rqcount = m.sdcount;			    /* just the data blocks */

	/* Part C: build the requests */
	rqg = allocrqg(rq, m.rqcount);			    /* get a request group */
	if (rqg == NULL) {				    /* malloc failed */
	    bp->b_flags |= B_ERROR;
	    bp->b_error = ENOMEM;
	    biodone(bp);
	    return REQUEST_ENOMEM;
	}
	rqg->plexno = plexno;
	rqg->flags = m.flags;
	rqno = 0;					    /* index in the request group */

	/* 1: PARITY BLOCK */
	/*
	 * Are we performing an operation which requires parity?  In that case,
	 * work out the parameters and define the parity block.
	 * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
	 */
	if (m.flags & XFR_PARITYOP) {			    /* need parity */
	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[m.psdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point back to group */
	    rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
		& ~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuff */
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    rqe->b.b_flags |= B_READ;			    /* we must read first */
	    m.sdcount++;				    /* adjust the subdisk count */
	    rqno++;					    /* and point to the next request */
	}
	/*
	 * 2: DATA BLOCKS
	 * Now build up requests for the blocks required
	 * for individual transfers
	 */
	for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */
	    if (mysdno == plex->subdisks)		    /* got to the end, */
		mysdno = 0;				    /* wrap around */
	    if (mysdno == m.psdno)			    /* parity, */
		mysdno++;				    /* we've given already */

	    rqe = &rqg->rqe[rqno];			    /* point to element */
	    sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
	    rqe->rqg = rqg;				    /* point to group */
	    if (m.flags & XFR_NEEDS_MALLOC)		    /* we need a malloced buffer first */
		rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
	    else
		rqe->flags = m.flags | XFR_DATA_BLOCK;	    /* transfer flags */
	    if (mysdno == m.badsdno) {			    /* this is the bad subdisk */
		rqg->badsdno = rqno;			    /* note which one */
		rqe->flags |= XFR_BAD_SUBDISK;		    /* note that it's dead */
		/*
		 * we can't read or write from/to it,
		 * but we don't need to malloc
		 */
		rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
	    }
	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
	    rqe->useroffset = m.useroffset;		    /* offset in user buffer */
	    rqe->sdno = sd->sdno;			    /* subdisk number */
	    rqe->driveno = sd->driveno;
	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		return REQUEST_ENOMEM;			    /* can't do it */
	    if ((m.flags & XFR_PARITYOP)		    /* parity operation, */
		&& ((m.flags & XFR_BAD_SUBDISK) == 0))	    /* and not the bad subdisk, */
		rqe->b.b_flags |= B_READ;		    /* we must read first */

	    /* Now update pointers for the next block */
	    *diskaddr += m.datalen;			    /* skip past what we've done */
	    m.stripesectors -= m.datalen;		    /* deduct from what's left */
	    m.useroffset += m.datalen;			    /* and move on in the user buffer */
	    m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */
	    m.dataoffset = 0;				    /* start at the beginning of next block */
	}

	/*
	 * 3: REMAINING BLOCKS FOR RECOVERY
	 * Finally, if we have a recovery operation, build
	 * up transfers for the other subdisks.  Follow the
	 * subdisks around until we get to where we started.
	 * These requests use only the group parameters.
	 */
	if ((rqno < m.rqcount)				    /* haven't done them all already */
	    && (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
	    for (; rqno < m.rqcount; rqno++, mysdno++) {
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */
		if (mysdno == plex->subdisks)		    /* got to the end, */
		    mysdno = 0;				    /* wrap around */
		if (mysdno == m.psdno)			    /* parity, */
		    mysdno++;				    /* we've given already */

		rqe = &rqg->rqe[rqno];			    /* point to element */
		sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
		rqe->rqg = rqg;				    /* point to group */
		rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
		rqe->dataoffset = 0;			    /* for tidiness' sake */
		rqe->groupoffset = 0;			    /* group starts at the beginning */
		rqe->datalen = 0;
		rqe->grouplen = m.grouplen;
		rqe->buflen = m.grouplen;
		rqe->flags = (m.flags | XFR_MALLOCED)	    /* transfer flags without data op stuff */
		    & ~XFR_DATAOP;
		rqe->sdno = sd->sdno;			    /* subdisk number */
		rqe->driveno = sd->driveno;
		if (build_rq_buffer(rqe, plex))		    /* build the buffer */
		    return REQUEST_ENOMEM;		    /* can't do it */
		rqe->b.b_flags |= B_READ;		    /* we must read first */
	    }
	}
	/*
	 * We need to lock the address range before
	 * doing anything.  We don't have to be
	 * performing a recovery operation: somebody
	 * else could be doing so, and the results could
	 * influence us.  Note the fact here, we'll perform
	 * the lock in launch_requests.
	 */
	rqg->lockbase = m.stripebase;
	if (*diskaddr < diskend)			    /* didn't finish the request on this stripe */
	    plex->multistripe++;			    /* count another one */
    }
    return REQUEST_OK;
}
/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
void
complete_rqe(struct bio *bio)
{
    union daemoninfo di;
    struct buf *bp = bio->bio_buf;
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct bio *ubio;					    /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;					    /* for error messages */

    get_mplock();
    rqe = (struct rqelement *) bp;			    /* point to the element that completed */
    rqg = rqe->rqg;					    /* and the request group */
    rq = rqg->rq;					    /* and the complete request */
    ubio = rq->bio;					    /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;					    /* one less outstanding I/O on this drive */
    vinum_conf.active--;				    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
	|| (vinum_conf.active == VINUM_MAXACTIVE))	    /* or the global limit */
	wakeup(&launch_requests);			    /* let another one at it */

    if ((bp->b_flags & B_ERROR) != 0) {			    /* transfer in error */
	/*
	 * NOTE(review): the gravity prefixes below are deliberately kept as
	 * in the original (" fatal" for reads, "fatal " for writes) — the
	 * spacing is inconsistent but these are runtime log strings.
	 */
	gravity = "";
	sd = &SD[rqe->sdno];

	if (bp->b_error != 0)				    /* did it return a number? */
	    rq->error = bp->b_error;			    /* yes, put it in. */
	else if (rq->error == 0)			    /* no: do we have one already? */
	    rq->error = EIO;				    /* no: catchall "I/O error" */
	sd->lasterror = rq->error;

	if (bp->b_cmd == BUF_CMD_READ) {
	    /* a drive gone (ENXIO) or no-retry subdisk crashes permanently */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = " fatal";
		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
	    }
	    log(LOG_ERR,
		"%s:%s read error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	} else {					    /* write operation */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = "fatal ";
		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
	    }
	    log(LOG_ERR,
		"%s:%s write error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	}
	log(LOG_ERR,
	    "%s: user buffer offset %lld for %d bytes\n",
	    sd->name,
	    (long long)ubio->bio_offset,
	    ubio->bio_buf->b_bcount);
	if (rq->error == ENXIO) {			    /* the drive's down too */
	    log(LOG_ERR,
		"%s: fatal drive I/O error, offset %lld for %d bytes\n",
		DRIVE[rqe->driveno].label.name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	    DRIVE[rqe->driveno].lasterror = rq->error;
	    set_drive_state(rqe->driveno,		    /* take the drive down */
		drive_down,
		setstate_force);
	}
    }
    /* Now update the statistics */
    if (bp->b_cmd == BUF_CMD_READ) {			    /* read operation */
	DRIVE[rqe->driveno].reads++;
	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
	SD[rqe->sdno].reads++;
	SD[rqe->sdno].bytes_read += bp->b_bcount;
	PLEX[rqe->rqg->plexno].reads++;
	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
	}
    } else {						    /* write operation */
	DRIVE[rqe->driveno].writes++;
	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
	SD[rqe->sdno].writes++;
	SD[rqe->sdno].bytes_written += bp->b_bcount;
	PLEX[rqe->rqg->plexno].writes++;
	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
	}
    }

    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
	int *sdata;					    /* source */
	int *data;					    /* and group data */
	int length;					    /* and count involved */
	int count;					    /* loop counter */
	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

	/* XOR destination is the user data */
	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT];	/* old data contents */
	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

	/* reconstruct the missing block by XORing in this survivor's data */
	for (count = 0; count < length; count++)
	    data[count] ^= sdata[count];

	/*
	 * In a normal read, we will normally read directly
	 * into the user buffer.  This doesn't work if
	 * we're also doing a recovery, so we have to
	 * copy it
	 */
	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
	    char *dst;

	    dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
	    bcopy(src, dst, length);			    /* move it */
	}
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */
	&& (rqg->active == 1))				    /* and this is the last active request */
	complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;					    /* this request now finished */
    if (rqg->active == 0) {				    /* request group finished, */
	rq->active--;					    /* one less */
	if (rqg->lock) {				    /* got a lock? */
	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
	    rqg->lock = 0;
	}
    }
    if (rq->active == 0) {				    /* request finished, */
#ifdef VINUMDEBUG
	if (debug & DEBUG_RESID) {
	    if (ubio->bio_buf->b_resid != 0)		    /* still something to transfer? */
		Debugger("resid");
	}
#endif
	if (rq->error) {				    /* did we have an error? */
	    if (rq->isplex) {				    /* plex operation, */
		/*
		 * NOTE(review): this path marks the user buffer in error but
		 * does not biodone()/freerq() here — presumably completion
		 * happens elsewhere for the plex-error case; verify against
		 * the callers before changing.
		 */
		ubio->bio_buf->b_flags |= B_ERROR;	    /* yes, propagate to user */
		ubio->bio_buf->b_error = rq->error;
	    } else {					    /* try to recover */
		di.rq = rq;
		queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
	    }
	} else {
	    ubio->bio_buf->b_resid = 0;			    /* completed our transfer */
	    if (rq->isplex == 0)			    /* volume request, */
		VOL[rq->volplex.volno].active--;	    /* another request finished */
	    biodone(ubio);				    /* top level buffer completed */
	    freerq(rq);					    /* return the request storage */
	}
    }
    rel_mplock();
}
/* attach an object to a superior object */ void attachobject(struct vinum_ioctl_msg *msg) { struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; int sdno; struct sd *sd; struct plex *plex; struct volume *vol; switch (msg->type) { case drive_object: /* you can't attach a drive to anything */ case volume_object: /* nor a volume */ case invalid_object: /* "this can't happen" */ reply->error = EINVAL; reply->msg[0] = '\0'; /* vinum(8) doesn't do this */ return; case sd_object: sd = validsd(msg->index, reply); if (sd == NULL) /* not a valid subdisk */ return; plex = validplex(msg->otherobject, reply); if (plex) { /* * We should be more intelligent about this. * We should be able to reattach a dead * subdisk, but if we want to increase the total * number of subdisks, we have a lot of reshuffling * to do. XXX */ if ((plex->organization != plex_concat) /* can't attach to striped and RAID-4/5 */ &&(!msg->force)) { /* without using force */ reply->error = EINVAL; /* no message, the user should check */ strcpy(reply->msg, "Can't attach to this plex organization"); return; } if (sd->plexno >= 0) { /* already belong to a plex */ reply->error = EBUSY; /* no message, the user should check */ reply->msg[0] = '\0'; return; } sd->plexoffset = msg->offset; /* this is where we want it */ set_sd_state(sd->sdno, sd_stale, setstate_force); /* make sure it's stale */ give_sd_to_plex(plex->plexno, sd->sdno); /* and give it to the plex */ update_sd_config(sd->sdno, 0); save_config(); } if (sd->state == sd_reviving) reply->error = EAGAIN; /* need to revive it */ else reply->error = 0; break; case plex_object: plex = validplex(msg->index, reply); /* get plex */ if (plex == NULL) return; vol = validvol(msg->otherobject, reply); /* and volume information */ if (vol) { if ((vol->plexes == MAXPLEX) /* we have too many already */ ||(plex->volno >= 0)) { /* or the plex has an owner */ reply->error = EINVAL; /* no message, the user should check */ reply->msg[0] = '\0'; return; } for (sdno = 0; 
sdno < plex->subdisks; sdno++) { sd = &SD[plex->sdnos[sdno]]; if (sd->state > sd_down) /* real subdisk, vaguely accessible */ set_sd_state(plex->sdnos[sdno], sd_stale, setstate_force); /* make it stale */ } set_plex_state(plex->plexno, plex_up, setstate_none); /* update plex state */ give_plex_to_volume(msg->otherobject, msg->index); /* and give it to the volume */ update_plex_config(plex->plexno, 0); save_config(); reply->error = 0; /* all went well */ } } }
/*
 * Perform I/O on a subdisk.
 *
 * Clones the caller's buf into a struct sdbuf aimed at the underlying
 * drive (offsetting b_blkno by the subdisk's position on the drive),
 * trims the transfer at the end of the subdisk, and hands it to the
 * drive's strategy routine.  Completion comes back via sdio_done.
 */
void
sdio(struct buf *bp)
{
    int s;						    /* spl */
    struct sd *sd;
    struct sdbuf *sbp;
    daddr_t endoffset;
    struct drive *drive;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
    sd = &SD[Sdno(bp->b_dev)];				    /* point to the subdisk */
    drive = &DRIVE[sd->driveno];

    if (drive->state != drive_up) {
	/* drive is inaccessible: degrade the subdisk accordingly */
	if (sd->state >= sd_crashed) {
	    if (bp->b_flags & B_READ)			    /* reading, */
		set_sd_state(sd->sdno, sd_crashed, setstate_force);
	    else
		set_sd_state(sd->sdno, sd_stale, setstate_force);
	}
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /*
     * We allow access to any kind of subdisk as long as we can expect
     * to get the I/O performed.
     */
    if (sd->state < sd_empty) {				    /* nothing to talk to, */
	bp->b_flags |= B_ERROR;
	bp->b_error = EIO;
	biodone(bp);
	return;
    }
    /* Get a buffer */
    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
    if (sbp == NULL) {
	bp->b_flags |= B_ERROR;
	bp->b_error = ENOMEM;
	biodone(bp);
	return;
    }
    bzero(sbp, sizeof(struct sdbuf));			    /* start with nothing */
    sbp->b.b_flags = bp->b_flags | B_CALL;		    /* inform us when it's done */
    sbp->b.b_bufsize = bp->b_bufsize;			    /* buffer size */
    sbp->b.b_bcount = bp->b_bcount;			    /* number of bytes to transfer */
    sbp->b.b_resid = bp->b_resid;			    /* and amount waiting */
    sbp->b.b_dev = DRIVE[sd->driveno].dev;		    /* device */
    sbp->b.b_data = bp->b_data;				    /* data buffer */
    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;	    /* translate to drive-relative sector */
    sbp->b.b_iodone = sdio_done;			    /* come here on completion */
    BUF_LOCKINIT(&sbp->b);				    /* get a lock for the buffer */
    BUF_LOCK(&sbp->b, LK_EXCLUSIVE);			    /* and lock it */
    sbp->bp = bp;					    /* note the address of the original header */
    sbp->sdno = sd->sdno;				    /* note for statistics */
    sbp->driveno = sd->driveno;

    endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;  /* final sector offset */
    if (endoffset > sd->sectors) {			    /* beyond the end */
	sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
	if (sbp->b.b_bcount <= 0) {			    /* nothing to transfer */
	    /*
	     * NOTE(review): this path completes the original buffer without
	     * setting an error, and frees sbp after BUF_LOCKINIT/BUF_LOCK
	     * without releasing the lock — confirm this is safe on this
	     * platform's buf implementation.
	     */
	    bp->b_resid = bp->b_bcount;			    /* nothing transferred */
	    biodone(bp);
	    Free(sbp);
	    return;
	}
    }
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
	    sbp->b.b_flags & B_READ ? "Read" : "Write",
	    major(sbp->b.b_dev),
	    minor(sbp->b.b_dev),
	    sbp->sdno,
	    (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
	    (int) sbp->b.b_blkno,
	    sbp->b.b_bcount);
#endif
    s = splbio();
#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
    BUF_STRATEGY(&sbp->b, 0);				    /* hand the cloned buf to the drive */
    splx(s);
}