Exemplo n.º 1
0
int
rf_FailDisk(RF_Raid_t *raidPtr, int fcol, int initRecon)
{

	/* need to suspend IO's here -- if there are DAGs in flight
	   and we pull the rug out from under ci_vp, Bad Things
	   can happen.  */

	rf_SuspendNewRequestsAndWait(raidPtr);

	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->Disks[fcol].status != rf_ds_failed) {
		/* must be failing something that is valid, or else it's
		   already marked as failed (in which case we don't
		   want to mark it failed again!) */
		raidPtr->numFailures++;
		raidPtr->Disks[fcol].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	/* Close the component, so that it's not "locked" if someone
	   else want's to use it! */

	rf_close_component(raidPtr, raidPtr->raid_cinfo[fcol].ci_vp,
			   raidPtr->Disks[fcol].auto_configured);

	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->raid_cinfo[fcol].ci_vp = NULL;

	/* Need to mark the component as not being auto_configured
	   (in case it was previously). */

	raidPtr->Disks[fcol].auto_configured = 0;
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	/* now we can allow IO to continue -- we'll be suspending it
	   again in rf_ReconstructFailedDisk() if we have to.. */

	rf_ResumeNewRequests(raidPtr);

	if (initRecon)
		rf_ReconstructFailedDisk(raidPtr, fcol);
	return (0);
}
Exemplo n.º 2
0
/* Do a complete copyback. */
void
rf_CopybackReconstructedData(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t c_label;
	int done, retcode;
	RF_CopybackDesc_t *desc;
	RF_RowCol_t frow, fcol;
	RF_RaidDisk_t *badDisk;
	char *databuf;

	struct partinfo dpart;
	struct vnode *vp;
	struct vattr va;
	struct proc *proc;

	int ac;

	done = 0;
	fcol = 0;
	for (frow = 0; frow < raidPtr->numRow; frow++) {
		for (fcol = 0; fcol < raidPtr->numCol; fcol++) {
			if (raidPtr->Disks[frow][fcol].status ==
			     rf_ds_dist_spared ||
			    raidPtr->Disks[frow][fcol].status ==
			     rf_ds_spared) {
				done = 1;
				break;
			}
		}
		if (done)
			break;
	}

	if (frow == raidPtr->numRow) {
		printf("COPYBACK: No disks need copyback.\n");
		return;
	}
	badDisk = &raidPtr->Disks[frow][fcol];

	proc = raidPtr->engine_thread;

	/*
	 * This device may have been opened successfully the first time.
	 * Close it before trying to open it again.
	 */

	if (raidPtr->raid_cinfo[frow][fcol].ci_vp != NULL) {
		printf("Close the opened device: %s.\n",
		    raidPtr->Disks[frow][fcol].devname);
 		vp = raidPtr->raid_cinfo[frow][fcol].ci_vp;
 		ac = raidPtr->Disks[frow][fcol].auto_configured;
 		rf_close_component(raidPtr, vp, ac);
		raidPtr->raid_cinfo[frow][fcol].ci_vp = NULL;

	}
 	/* Note that this disk was *not* auto_configured (any longer). */
 	raidPtr->Disks[frow][fcol].auto_configured = 0;

	printf("About to (re-)open the device: %s.\n",
	    raidPtr->Disks[frow][fcol].devname);

	retcode = raidlookup(raidPtr->Disks[frow][fcol].devname, proc, &vp);

	if (retcode) {
		printf("COPYBACK: raidlookup on device: %s failed: %d !\n",
		    raidPtr->Disks[frow][fcol].devname, retcode);

		/*
		 * XXX The component isn't responding properly... Must be
		 * still dead :-(
		 */
		return;

	} else {

		/*
		 * Ok, so we can at least do a lookup...
		 * How about actually getting a vp for it ?
		 */

		if ((retcode = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0)
		{
			return;
		}
		retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t) &dpart, FREAD,
		    proc->p_ucred, proc);
		if (retcode) {
			return;
		}
		raidPtr->Disks[frow][fcol].blockSize = dpart.disklab->d_secsize;

		raidPtr->Disks[frow][fcol].numBlocks = dpart.part->p_size -
		    rf_protectedSectors;

		raidPtr->raid_cinfo[frow][fcol].ci_vp = vp;
		raidPtr->raid_cinfo[frow][fcol].ci_dev = va.va_rdev;

		/* XXX Or the above ? */
		raidPtr->Disks[frow][fcol].dev = va.va_rdev;

		/*
		 * We allow the user to specify that only a fraction of the
		 * disks should be used this is just for debug: it speeds up
		 * the parity scan.
		 */
		raidPtr->Disks[frow][fcol].numBlocks =
		    raidPtr->Disks[frow][fcol].numBlocks *
		    rf_sizePercentage / 100;
	}
#if 0
	/* This is the way it was done before the CAM stuff was removed. */

	if (rf_extract_ids(badDisk->devname, &bus, &targ, &lun)) {
		printf("COPYBACK: unable to extract bus, target, lun from"
		    " devname %s.\n", badDisk->devname);
		return;
	}
	/*
	 * TUR the disk that's marked as bad to be sure that it's actually
	 * alive.
	 */
	rf_SCSI_AllocTUR(&tur_op);
	retcode = rf_SCSI_DoTUR(tur_op, bus, targ, lun, badDisk->dev);
	rf_SCSI_FreeDiskOp(tur_op, 0);
#endif

	if (retcode) {
		printf("COPYBACK: target disk failed TUR.\n");
		return;
	}
	/* Get a buffer to hold one SU. */
	RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr,
	    raidPtr->Layout.sectorsPerStripeUnit), (char *));

	/* Create a descriptor. */
	RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *));
	desc->raidPtr = raidPtr;
	desc->status = 0;
	desc->frow = frow;
	desc->fcol = fcol;
	desc->spRow = badDisk->spareRow;
	desc->spCol = badDisk->spareCol;
	desc->stripeAddr = 0;
	desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit *
	    raidPtr->Layout.numDataCol;
	desc->databuf = databuf;
	desc->mcpair = rf_AllocMCPair();

	printf("COPYBACK: Quiescing the array.\n");
	/*
	 * Quiesce the array, since we don't want to code support for user
	 * accs here.
	 */
	rf_SuspendNewRequestsAndWait(raidPtr);

	/* Adjust state of the array and of the disks. */
	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->Disks[desc->frow][desc->fcol].status = rf_ds_optimal;
	raidPtr->status[desc->frow] = rf_rs_optimal;
	rf_copyback_in_progress = 1;	/* Debug only. */
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	printf("COPYBACK: Beginning\n");
	RF_GETTIME(desc->starttime);
	rf_ContinueCopyback(desc);

	/*
	 * Data has been restored.
	 * Fix up the component label.
	 * Don't actually need the read here.
	 */
	raidread_component_label(raidPtr->raid_cinfo[frow][fcol].ci_dev,
				 raidPtr->raid_cinfo[frow][fcol].ci_vp,
				 &c_label);

	raid_init_component_label(raidPtr, &c_label);

	c_label.row = frow;
	c_label.column = fcol;

	raidwrite_component_label(raidPtr->raid_cinfo[frow][fcol].ci_dev,
				  raidPtr->raid_cinfo[frow][fcol].ci_vp,
				  &c_label);
}