Example #1
/*
 * The access has an array of dagLists, one dagList per parity stripe.
 * Fire the first DAG in each parity stripe (dagList).
 * DAGs within a stripe (dagList) must be executed sequentially.
 *  - This preserves atomic parity update.
 * DAGs for independent parity groups (stripes) are fired concurrently.
 */
int
rf_State_ExecuteDAG(RF_RaidAccessDesc_t *desc)
{
	int i;
	RF_DagHeader_t *dag_h;
	RF_DagList_t *dagArray = desc->dagArray;

	/*
	 * Next state is always rf_State_ProcessDAG. Important to do this
	 * before firing the first dag (it may finish before we leave this
	 * routine).
	 */
	desc->state++;

	/*
	 * Sweep dag array, a stripe at a time, firing the first dag in each
	 * stripe.
	 */
	for (i = 0; i < desc->numStripes; i++) {
		RF_ASSERT(dagArray[i].numDags > 0);
		RF_ASSERT(dagArray[i].numDagsDone == 0);
		RF_ASSERT(dagArray[i].numDagsFired == 0);
		RF_ETIMER_START(dagArray[i].tracerec.timer);
		/* Fire first dag in this stripe. */
		dag_h = dagArray[i].dags;
		RF_ASSERT(dag_h);
		dagArray[i].numDagsFired++;
		/*
		 * XXX Yet another case where we pass in a conflicting
		 * function pointer :-(  XXX  GO
		 */
		rf_DispatchDAG(dag_h, (void (*) (void *)) rf_ContinueDagAccess,
		    &dagArray[i]);
	}

	/*
	 * The DAG will always call the callback, even if there was no
	 * blocking, so we are always suspended in this state.
	 */
	return RF_TRUE;
}
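For context, the RF_TRUE/RF_FALSE return convention used by these state functions only makes sense next to the engine that walks the state vector. The sketch below is a minimal, hypothetical driver loop written for illustration; it is not RAIDframe's actual continuation code (rf_ContinueRaidAccess), and every name in it is invented. It assumes each state advances desc->state itself and returns RF_TRUE when the access is suspended pending a callback.

#define RF_TRUE  1
#define RF_FALSE 0

struct access_desc;                              /* hypothetical access descriptor */
typedef int (*state_fn)(struct access_desc *);   /* returns RF_TRUE when suspended */

struct access_desc {
	int       state;    /* index into the state vector */
	state_fn *states;   /* NULL-terminated vector of state functions */
};

/*
 * Run states until one suspends (returns RF_TRUE) or the vector ends.
 * A completion callback would simply call this again to resume.
 */
static void
continue_access(struct access_desc *desc)
{
	while (desc->states[desc->state] != NULL) {
		/* Each state bumps desc->state itself before it can block,
		 * mirroring the "desc->state++ before firing" rule above. */
		if ((*desc->states[desc->state])(desc) == RF_TRUE)
			return;
	}
}

/* Toy states, for illustration only. */
static int st_map(struct access_desc *d)  { d->state++; return RF_FALSE; }
static int st_wait(struct access_desc *d) { d->state++; return RF_TRUE;  }

int
main(void)
{
	state_fn vec[] = { st_map, st_wait, NULL };
	struct access_desc desc = { 0, vec };

	continue_access(&desc);		/* runs st_map, then suspends in st_wait */
	continue_access(&desc);		/* "callback" re-entry: vector exhausted */
	return 0;
}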
/***************************************************************************************
* 	This function is called by the doubly degraded read DAG,
* 	EO_200_CreateReadDAG.
*
***************************************************************************************/
int
rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int     ndataParam = 0;
	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int     i, prm, sector, nresults = node->numResults;
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	unsigned sosAddr;
	int     mallc_one = 0, mallc_two = 0;	/* flags to indicate if
						 * memory is allocated */
	int     bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
	        npda;
	RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
	char  **buf, *ebuf, *pbuf, *dest[2];
	long   *suoff = NULL, *suend = NULL, *prmToCol = NULL,
	    psuoff = 0, esuoff = 0;
	RF_SectorNum_t startSector, endSector;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	/* Find out the number of parameters which are pdas for data
	 * information */
	for (i = 0; i <= np; i++)
		if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) {
			ndataParam = i;
			break;
		}
	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
	if (ndataParam != 0) {
		RF_Malloc(suoff, ndataParam * sizeof(long), (long *));
		RF_Malloc(suend, ndataParam * sizeof(long), (long *));
		RF_Malloc(prmToCol, ndataParam * sizeof(long), (long *));
	}
/**************************************************************************************
 * When the parity disk and one data disk have died, we use the second piece of
 * redundant information, 'E', to recover the data on the dead data disk.  This
 * function is used in the recovery node of EO_110_CreateReadDAG.
 **************************************************************************************/
int
rf_RecoveryEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	RF_RowCol_t scol,	/* source logical column */
	        fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);	/* logical column of
									 * failed SU */
	int     i;
	RF_PhysDiskAddr_t *pda;
	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	memset((char *) node->results[0], 0,
	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
				if (i == node->numParams - 4)
					scol = RF_EO_MATRIX_DIM - 2;	/* the column of
									 * redundant E */
				else
					scol = rf_EUCol(layoutPtr, pda->raidAddress);
				srcbuf = (char *) node->params[i + 1].p;
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, 0));	/* node executed successfully */
}
int
rf_SimpleONEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	int     retcode = 0;
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	int     length;
	RF_RowCol_t scol;
	RF_Etimer_t timer;

	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);	/* this is a pda of
														 * writeDataNodes */
		/* bxor to buffer of readDataNodes */
		retcode = rf_bxor(node->params[5].p, node->params[1].p, length);
		/* find out the corresponding column in the encoding matrix for
		 * the write column to be encoded into redundant disk 'E' */
		scol = rf_EUCol(layoutPtr, pda->raidAddress);
		srcbuf = node->params[1].p;
		destbuf = node->params[3].p;
		/* Start encoding process */
		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
		rf_bxor(node->params[5].p, node->params[1].p, length);
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
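rf_SimpleONEFunc above relies on rf_bxor to fold data in and out of a buffer around the encode step (the second rf_bxor call apparently undoes the first, since applying the same XOR twice restores the buffer). As a point of reference only, here is a hypothetical stand-in for that kind of XOR primitive; RAIDframe's real rf_bxor works a word at a time and its parameter list differs between versions, so this shows the idea, not the API.

#include <stddef.h>

/*
 * Byte-wise XOR accumulate: dest ^= src over 'len' bytes.  Purely an
 * illustration of the primitive the encode/recovery nodes above rely on;
 * the name and signature are invented for this sketch.
 */
static void
xor_into(unsigned char *dest, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dest[i] ^= src[i];
}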
Example #5
int
rf_State_Map(RF_RaidAccessDesc_t *desc)
{
	RF_Raid_t *raidPtr = desc->raidPtr;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;

	RF_ETIMER_START(timer);
#endif

	if (!(desc->asmap = rf_MapAccess(raidPtr, desc->raidAddress, desc->numBlocks,
		    desc->bufPtr, RF_DONT_REMAP)))
		RF_PANIC();

#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.map_us = RF_ETIMER_VAL_US(timer);
#endif

	desc->state++;
	return RF_FALSE;
}
Example #6
/* Enqueue a disk I/O
 *
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 */
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
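The dispatch rules in the comment at the top of rf_DiskIOEnqueue boil down to a small predicate. The sketch below is a hypothetical, self-contained rendering of that decision; the struct, field names, and threshold are invented and do not reflect RAIDframe's RF_DiskQueue_t or its RF_OK_TO_DISPATCH macro.

/*
 * Toy queue used only to illustrate the dispatch-or-queue decision
 * described above; not RAIDframe's disk queue.
 */
struct toy_disk_queue {
	int outstanding;	/* requests already handed to the driver */
	int max_outstanding;	/* dispatch limit ("queue not full") */
	int cur_priority;	/* priority currently being serviced */
};

/*
 * Empty queue => dispatch; room left and the request's priority is
 * acceptable => dispatch; otherwise the caller should queue the request.
 */
static int
ok_to_dispatch(const struct toy_disk_queue *q, int pri)
{
	if (q->outstanding == 0)
		return 1;
	return (q->outstanding < q->max_outstanding &&
	    pri >= q->cur_priority);
}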
Example #7
/* only make it this far if all dags complete successfully */
int
rf_State_Cleanup(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_AccessStripeMap_t *asm_p;
	RF_DagList_t *dagList;
	int i;

	desc->state++;

#if RF_ACC_TRACE > 0
	timer = tracerec->timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.dag_retry_us = RF_ETIMER_VAL_US(timer);

	/* the RAID I/O is complete.  Clean up. */
	tracerec->specific.user.dag_retry_us = 0;

	RF_ETIMER_START(timer);
#endif
	/* free all dags */
	dagList = desc->dagList;
	for (i = 0; i < desc->numStripes; i++) {
		rf_FreeDAG(dagList->dags);
		dagList = dagList->next;
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us = RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
#endif
	for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
		if (!rf_suppressLocksAndLargeWrites &&
		    asm_p->parityInfo &&
		    !(desc->flags & RF_DAG_SUPPRESS_LOCKS)) {
			RF_ASSERT_VALID_LOCKREQ(&asm_p->lockReqDesc);
			rf_ReleaseStripeLock(raidPtr->lockTable,
					     asm_p->stripeID,
					     &asm_p->lockReqDesc);
		}
		if (asm_p->flags & RF_ASM_FLAGS_RECON_BLOCKED) {
			rf_UnblockRecon(raidPtr, asm_p);
		}
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
#endif
	rf_FreeAccessStripeMap(asmh);
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_STOP(desc->timer);
	RF_ETIMER_EVAL(desc->timer);

	timer = desc->tracerec.tot_timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	desc->tracerec.total_us = RF_ETIMER_VAL_US(timer);

	rf_LogTraceRec(raidPtr, tracerec);
#endif
	desc->flags |= RF_DAG_ACCESS_COMPLETE;

	return RF_FALSE;
}
Example #8
int
rf_State_ProcessDAG(RF_RaidAccessDesc_t *desc)
{
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_DagHeader_t *dag_h;
	int     i, j, done = RF_TRUE;
	RF_DagList_t *dagList, *temp;

	/* check to see if this is the last dag */
	dagList = desc->dagList;
	for (i = 0; i < desc->numStripes; i++) {
		if (dagList->numDags != dagList->numDagsDone)
			done = RF_FALSE;
		dagList = dagList->next;
	}

	if (done) {
		if (desc->status) {
			/* a dag failed, retry */
			/* free all dags */
			dagList = desc->dagList;
			for (i = 0; i < desc->numStripes; i++) {
				rf_FreeDAG(dagList->dags);
				temp = dagList;
				dagList = dagList->next;
				rf_FreeDAGList(temp);
			}
			desc->dagList = NULL;

			rf_MarkFailuresInASMList(raidPtr, asmh);

			/* note the retry so that we'll bail in
			   rf_State_CreateDAG() once we've retried
			   the IO RF_RETRY_THRESHOLD times */

			desc->numRetries++;

			/* back up to rf_State_CreateDAG */
			desc->state = desc->state - 2;
			return RF_FALSE;
		} else {
			/* move on to rf_State_Cleanup */
			desc->state++;
		}
		return RF_FALSE;
	} else {
		/* more dags to execute */
		/* see if any are ready to be fired.  if so, fire them */
		/* don't fire the initial dag in a list, it's fired in
		 * rf_State_ExecuteDAG */
		dagList = desc->dagList;
		for (i = 0; i < desc->numStripes; i++) {
			if ((dagList->numDagsDone < dagList->numDags)
			    && (dagList->numDagsDone == dagList->numDagsFired)
			    && (dagList->numDagsFired > 0)) {
#if RF_ACC_TRACE > 0
				RF_ETIMER_START(dagList->tracerec.timer);
#endif
				/* fire next dag in this stripe */
				/* first, skip to next dag awaiting execution */
				dag_h = dagList->dags;
				for (j = 0; j < dagList->numDagsDone; j++)
					dag_h = dag_h->next;
				dagList->numDagsFired++;
				rf_DispatchDAG(dag_h, (void (*) (void *)) rf_ContinueDagAccess,
				    dagList);
			}
			dagList = dagList->next;
		}
		return RF_TRUE;
	}
}
Example #9
/*
 * the following three states create, execute, and post-process dags
 * the error recovery unit is a single dag.
 * by default, SelectAlgorithm creates an array of dags, one per parity stripe
 * in some tricky cases, multiple dags per stripe are created
 *   - dags within a parity stripe are executed sequentially (arbitrary order)
 *   - dags for distinct parity stripes are executed concurrently
 *
 * repeat until all dags complete successfully -or- dag selection fails
 *
 * while !done
 *   create dag(s) (SelectAlgorithm)
 *   if dag
 *     execute dag (DispatchDAG)
 *     if dag successful
 *       done (SUCCESS)
 *     else
 *       !done (RETRY - start over with new dags)
 *   else
 *     done (FAIL)
 */
int
rf_State_CreateDAG(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_DagHeader_t *dag_h;
	RF_DagList_t *dagList;
	struct buf *bp;
	int     i, selectStatus;

	/* generate a dag for the access, and fire it off.  When the dag
	 * completes, we'll get re-invoked in the next state. */
#if RF_ACC_TRACE > 0
	RF_ETIMER_START(timer);
#endif
	/* SelectAlgorithm returns one or more dags */
	selectStatus = rf_SelectAlgorithm(desc, desc->flags | RF_DAG_SUPPRESS_LOCKS);
#if RF_DEBUG_VALIDATE_DAG
	if (rf_printDAGsDebug) {
		dagList = desc->dagList;
		for (i = 0; i < desc->numStripes; i++) {
			rf_PrintDAGList(dagList->dags);
			dagList = dagList->next;
		}
	}
#endif /* RF_DEBUG_VALIDATE_DAG */
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	/* update time to create all dags */
	tracerec->specific.user.dag_create_us = RF_ETIMER_VAL_US(timer);
#endif

	desc->status = 0;	/* good status */

	if (selectStatus || (desc->numRetries > RF_RETRY_THRESHOLD)) {
		/* failed to create a dag */
		/* this happens when there are too many faults or the dag
		 * library is incomplete */
		if (selectStatus) {
			printf("raid%d: failed to create a dag. "
			       "Too many component failures.\n",
			       desc->raidPtr->raidid);
		} else {
			printf("raid%d: IO failed after %d retries.\n",
			       desc->raidPtr->raidid, RF_RETRY_THRESHOLD);
		}

		desc->status = 1; /* bad status */
		/* skip straight to rf_State_Cleanup() */
		desc->state = rf_CleanupState;
		bp = (struct buf *)desc->bp;
		bp->b_error = EIO;
		bp->b_resid = bp->b_bcount;
	} else {
		/* bind dags to desc */
		dagList = desc->dagList;
		for (i = 0; i < desc->numStripes; i++) {
			dag_h = dagList->dags;
			while (dag_h) {
				dag_h->bp = (struct buf *) desc->bp;
#if RF_ACC_TRACE > 0
				dag_h->tracerec = tracerec;
#endif
				dag_h = dag_h->next;
			}
			dagList = dagList->next;
		}
		desc->flags |= RF_DAG_DISPATCH_RETURNED;
		desc->state++;	/* next state should be rf_State_ExecuteDAG */
	}
	return RF_FALSE;
}
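The comment block at the head of this example lays out the create/execute/retry loop that rf_State_CreateDAG, rf_State_ExecuteDAG and rf_State_ProcessDAG implement between them. A flattened, hypothetical rendering of that loop is sketched below; the stub functions are invented stand-ins (create_dags for SelectAlgorithm, execute_dags for the dispatch/process pair), and the transient-failure counter exists only so the sketch runs.

#include <stdio.h>

/* Invented stubs; attempts_left fakes one transient DAG failure. */
static int attempts_left = 2;

static int  create_dags(void)  { return 1; }			/* "DAGs created" */
static int  execute_dags(void) { return --attempts_left <= 0; }	/* fail once, then succeed */
static void free_dags(void)    { }

/* Returns 0 on success, -1 when DAG creation fails or retries are exhausted. */
static int
run_access(int retry_threshold)
{
	int retries = 0;

	for (;;) {
		if (!create_dags())
			return -1;	/* no DAG could be built: hard failure */
		if (execute_dags())
			return 0;	/* every stripe completed cleanly */
		if (++retries > retry_threshold)
			return -1;	/* RF_RETRY_THRESHOLD-style cap reached */
		free_dags();		/* discard failed DAGs, remap failures, retry */
	}
}

int
main(void)
{
	printf("access %s\n", run_access(3) == 0 ? "succeeded" : "failed");
	return 0;
}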
Example #10
int
rf_State_Lock(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_AccessStripeMap_t *asm_p;
	RF_StripeNum_t lastStripeID = -1;
	int     suspended = RF_FALSE;

#if RF_ACC_TRACE > 0
	RF_ETIMER_START(timer);
#endif

	/* acquire each lock that we don't already hold */
	for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
		RF_ASSERT(RF_IO_IS_R_OR_W(desc->type));
		if (!rf_suppressLocksAndLargeWrites &&
		    asm_p->parityInfo &&
		    !(desc->flags & RF_DAG_SUPPRESS_LOCKS) &&
		    !(asm_p->flags & RF_ASM_FLAGS_LOCK_TRIED)) {
			asm_p->flags |= RF_ASM_FLAGS_LOCK_TRIED;
				/* locks must be acquired hierarchically */
			RF_ASSERT(asm_p->stripeID > lastStripeID);
			lastStripeID = asm_p->stripeID;

			RF_INIT_LOCK_REQ_DESC(asm_p->lockReqDesc, desc->type,
					      (void (*) (struct buf *)) rf_ContinueRaidAccess, desc, asm_p,
					      raidPtr->Layout.dataSectorsPerStripe);
			if (rf_AcquireStripeLock(raidPtr->lockTable, asm_p->stripeID,
						 &asm_p->lockReqDesc)) {
				suspended = RF_TRUE;
				break;
			}
		}
		if (desc->type == RF_IO_TYPE_WRITE &&
		    raidPtr->status == rf_rs_reconstructing) {
			if (!(asm_p->flags & RF_ASM_FLAGS_FORCE_TRIED)) {
				int     val;

				asm_p->flags |= RF_ASM_FLAGS_FORCE_TRIED;
				val = rf_ForceOrBlockRecon(raidPtr, asm_p,
							   (void (*) (RF_Raid_t *, void *)) rf_ContinueRaidAccess, desc);
				if (val == 0) {
					asm_p->flags |= RF_ASM_FLAGS_RECON_BLOCKED;
				} else {
					suspended = RF_TRUE;
					break;
				}
			} else {
#if RF_DEBUG_PSS > 0
				if (rf_pssDebug) {
					printf("raid%d: skipping force/block because already done, psid %ld\n",
					       desc->raidPtr->raidid,
					       (long) asm_p->stripeID);
				}
#endif
			}
		} else {
#if RF_DEBUG_PSS > 0
			if (rf_pssDebug) {
				printf("raid%d: skipping force/block because not write or not under recon, psid %ld\n",
				       desc->raidPtr->raidid,
				       (long) asm_p->stripeID);
			}
#endif
		}
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);
#endif
	if (suspended)
		return (RF_TRUE);

	desc->state++;
	return (RF_FALSE);
}
Example #11
int
rf_State_Quiesce(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_CallbackDesc_t *cb;
	RF_Raid_t *raidPtr;
	int     suspended = RF_FALSE;
	int need_cb, used_cb;

	raidPtr = desc->raidPtr;

#if RF_ACC_TRACE > 0
	RF_ETIMER_START(timer);
	RF_ETIMER_START(desc->timer);
#endif

	need_cb = 0;
	used_cb = 0;
	cb = NULL;

	rf_lock_mutex2(raidPtr->access_suspend_mutex);
	/* Do an initial check to see if we might need a callback structure */
	if (raidPtr->accesses_suspended) {
		need_cb = 1;
	}
	rf_unlock_mutex2(raidPtr->access_suspend_mutex);

	if (need_cb) {
		/* create a callback if we might need it...
		   and we likely do. */
		cb = rf_AllocCallbackDesc();
	}

	rf_lock_mutex2(raidPtr->access_suspend_mutex);
	if (raidPtr->accesses_suspended) {
		cb->callbackFunc = (void (*) (RF_CBParam_t)) rf_ContinueRaidAccess;
		cb->callbackArg.p = (void *) desc;
		cb->next = raidPtr->quiesce_wait_list;
		raidPtr->quiesce_wait_list = cb;
		suspended = RF_TRUE;
		used_cb = 1;
	}
	rf_unlock_mutex2(raidPtr->access_suspend_mutex);

	if ((need_cb == 1) && (used_cb == 0)) {
		rf_FreeCallbackDesc(cb);
	}

#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.suspend_ovhd_us += RF_ETIMER_VAL_US(timer);
#endif

#if RF_DEBUG_QUIESCE
	if (suspended && rf_quiesceDebug)
		printf("Stalling access due to quiescence lock\n");
#endif
	desc->state++;
	return suspended;
}
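rf_State_Quiesce allocates its callback descriptor outside the mutex, then re-checks under the lock and frees the descriptor if it turned out not to be needed. The following is a user-space analogue of that check/allocate/recheck pattern, written with pthreads purely for illustration; the structures and names are invented and do not correspond to RAIDframe types.

#include <pthread.h>
#include <stdlib.h>

struct callback {
	void (*func)(void *);
	void *arg;
	struct callback *next;
};

struct controller {
	pthread_mutex_t  lock;
	int              suspended;	/* analogue of accesses_suspended */
	struct callback *wait_list;	/* analogue of quiesce_wait_list */
};

/* Returns 1 if the caller was queued (suspended), 0 if it may proceed. */
static int
maybe_wait(struct controller *c, void (*func)(void *), void *arg)
{
	struct callback *cb = NULL;
	int need_cb, used_cb = 0;

	/* First look: decide whether we are likely to need a callback. */
	pthread_mutex_lock(&c->lock);
	need_cb = c->suspended;
	pthread_mutex_unlock(&c->lock);

	if (need_cb)
		cb = malloc(sizeof(*cb));	/* allocate outside the lock */

	/* Second look: only commit the callback if still suspended. */
	pthread_mutex_lock(&c->lock);
	if (c->suspended && cb != NULL) {
		cb->func = func;
		cb->arg = arg;
		cb->next = c->wait_list;
		c->wait_list = cb;
		used_cb = 1;
	}
	pthread_mutex_unlock(&c->lock);

	if (cb != NULL && !used_cb)
		free(cb);	/* state changed between the two looks */

	return used_cb;
}

Allocating outside the lock keeps the potentially blocking allocation out of the critical section, at the cost of occasionally throwing the callback away.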
Example #12
/* Only make it this far if all dags complete successfully. */
int
rf_State_Cleanup(RF_RaidAccessDesc_t *desc)
{
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_AccessStripeMap_t *asm_p;
	RF_DagHeader_t *dag_h;
	RF_Etimer_t timer;
	int i;

	desc->state++;

	timer = tracerec->timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.dag_retry_us = RF_ETIMER_VAL_US(timer);

	/* The RAID I/O is complete. Clean up. */
	tracerec->specific.user.dag_retry_us = 0;

	RF_ETIMER_START(timer);
	if (desc->flags & RF_DAG_RETURN_DAG) {
		/* Copy dags into paramDAG. */
		*(desc->paramDAG) = desc->dagArray[0].dags;
		dag_h = *(desc->paramDAG);
		for (i = 1; i < desc->numStripes; i++) {
			/* Concatenate dags from remaining stripes. */
			RF_ASSERT(dag_h);
			while (dag_h->next)
				dag_h = dag_h->next;
			dag_h->next = desc->dagArray[i].dags;
		}
	} else {
		/* Free all dags. */
		for (i = 0; i < desc->numStripes; i++) {
			rf_FreeDAG(desc->dagArray[i].dags);
		}
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us = RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
	if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) {
		for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
			if (!rf_suppressLocksAndLargeWrites &&
			    asm_p->parityInfo &&
			    !(desc->flags & RF_DAG_SUPPRESS_LOCKS)) {
				RF_ASSERT_VALID_LOCKREQ(&asm_p->lockReqDesc);
				rf_ReleaseStripeLock(raidPtr->lockTable,
				    asm_p->stripeID, &asm_p->lockReqDesc);
			}
			if (asm_p->flags & RF_ASM_FLAGS_RECON_BLOCKED) {
				rf_UnblockRecon(raidPtr, asm_p);
			}
		}
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
	if (desc->flags & RF_DAG_RETURN_ASM)
		*(desc->paramASM) = asmh;
	else
		rf_FreeAccessStripeMap(asmh);
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_STOP(desc->timer);
	RF_ETIMER_EVAL(desc->timer);

	timer = desc->tracerec.tot_timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	desc->tracerec.total_us = RF_ETIMER_VAL_US(timer);

	rf_LogTraceRec(raidPtr, tracerec);

	desc->flags |= RF_DAG_ACCESS_COMPLETE;

	return RF_FALSE;
}
Example #13
/*
 * rf_State_ProcessDAG is entered when a dag completes.
 * First, check that all DAGs in the access have completed.
 * If not, fire as many DAGs as possible.
 */
int
rf_State_ProcessDAG(RF_RaidAccessDesc_t *desc)
{
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_DagHeader_t *dag_h;
	int i, j, done = RF_TRUE;
	RF_DagList_t *dagArray = desc->dagArray;
	RF_Etimer_t timer;

	/* Check to see if this is the last dag. */
	for (i = 0; i < desc->numStripes; i++)
		if (dagArray[i].numDags != dagArray[i].numDagsDone)
			done = RF_FALSE;

	if (done) {
		if (desc->status) {
			/* A dag failed, retry. */
			RF_ETIMER_START(timer);
			/* Free all dags. */
			for (i = 0; i < desc->numStripes; i++) {
				rf_FreeDAG(desc->dagArray[i].dags);
			}
			rf_MarkFailuresInASMList(raidPtr, asmh);
			/* Back up to rf_State_CreateDAG. */
			desc->state = desc->state - 2;
			return RF_FALSE;
		} else {
			/* Move on to rf_State_Cleanup. */
			desc->state++;
		}
		return RF_FALSE;
	} else {
		/* More dags to execute. */
		/* See if any are ready to be fired. If so, fire them. */
		/*
		 * Don't fire the initial dag in a list, it's fired in
		 * rf_State_ExecuteDAG.
		 */
		for (i = 0; i < desc->numStripes; i++) {
			if ((dagArray[i].numDagsDone < dagArray[i].numDags) &&
			    (dagArray[i].numDagsDone ==
			     dagArray[i].numDagsFired) &&
			    (dagArray[i].numDagsFired > 0)) {
				RF_ETIMER_START(dagArray[i].tracerec.timer);
				/* Fire next dag in this stripe. */
				/*
				 * First, skip to next dag awaiting execution.
				 */
				dag_h = dagArray[i].dags;
				for (j = 0; j < dagArray[i].numDagsDone; j++)
					dag_h = dag_h->next;
				dagArray[i].numDagsFired++;
				/*
				 * XXX And again we pass a different function
				 * pointer... GO
				 */
				rf_DispatchDAG(dag_h, (void (*) (void *))
				    rf_ContinueDagAccess, &dagArray[i]);
			}
		}
		return RF_TRUE;
	}
}
Example #14
int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/*
	 * The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and Q.
	 * Use P & Q to recover the second failed data unit in P. Zero fill
	 * Q, then apply the recovered data to P. Then apply the data being
	 * written to the failed drive. Then walk through the surviving drives,
	 * applying new data when it exists, otherwise the recovery data.
	 * Quite a mess.
	 *
	 *
	 * The params:
	 *
	 *   read pda0, read pda1, ..., read pda (numDataCol-3),
	 *   write pda0, ..., write pda (numStripeUnitAccess - numDataFailed),
	 *   failed pda, raidPtr, asmap
	 */

	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
	    node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	/* Determine the other failed data unit. */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
	    asmap->raidAddress);
	/* Need to determine the column of the other failed disk. */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* Compute the data unit offset within the column. */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
		    &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* Skip over dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/*
	 * Recover the data. The column we want to recover, we write over the
	 * parity. The column we don't care about, we dump in q.
	 */
	if (coeff < i)		/* Recovering 'a'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* Recovering 'b'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero-fill Q, then fold the recovered
	 * data (now in P) back into it via rf_IncQ. */
	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* Now apply all the write data to the buffer. */
	/*
	 * Single stripe unit write case: The failed data is the only thing
	 * we are writing.
	 */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* Dest, src, len, coeff. */
	rf_IncQ((unsigned long *) qpda->bufPtr,
	    (unsigned long *) asmap->failedPDAs[0]->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* Now apply all the recovery data. */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
Example #15
int
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap =
	    (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr, suoffset;
	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
	int two = 0;
	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
	char *buf;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector +
	     asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);
		ppda = node->params[np - 6].p;
		ppda2 = node->params[np - 5].p;
		qpda = node->params[np - 4].p;
		qpda2 = node->params[np - 3].p;
		d = (np - 6);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		qpda = node->params[np - 3].p;
		d = (np - 4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		buf = pda->bufPtr;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		len = pda->numSector;
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
		    pda->raidAddress);
		/* Compute the data unit offset within the column. */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* See if pda intersects a recovery pda. */
		rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		if (two)
			rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
	}

	/*
	 * Ok, we got the parity back to the point where we can recover. We
	 * now need to determine the coeff of the columns that need to be
	 * recovered. Note that we only need to recover a single stripe unit.
	 */

	if (asmap->failedPDAs[1] == NULL) {	/*
						 * Only a single stripe unit
						 * to recover.
						 */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
		    asmap->raidAddress);
		/* Need to determine the column of the other failed disk. */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
		    pda->raidAddress);
		/* Compute the data unit offset within the column. */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr,
			    npda.raidAddress, &(npda.row), &(npda.col),
			    &(npda.startSector), 0);
			/* Skip over dead disks. */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col]
			    .status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/*
		 * Recover the data. Since we need only to recover one
		 * column, we overwrite the parity with the other one.
		 */
		if (coeff < i)	/* Recovering 'a'. */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector),
			    coeff, i);
		else		/* Recovering 'b'. */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector),
			    i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
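For orientation, the algebra behind rf_PQ_recover in the two functions above is the usual P+Q (RAID-6 style) scheme over a Galois field; the specific field and coefficient tables live elsewhere in RAIDframe, so the following is a generic statement of the recovery, not the code's literal formula. Once the surviving data units have been folded back into the parity buffers (the rf_applyPDA calls), the remaining P' and Q' depend only on the two missing units D_a and D_b:

P' = D_a \oplus D_b, \qquad Q' = g^{a} D_a \oplus g^{b} D_b

D_a = (Q' \oplus g^{b} P') \cdot (g^{a} \oplus g^{b})^{-1}, \qquad D_b = P' \oplus D_a

Here \oplus is addition in the field (plain XOR) and g^{a}, g^{b} are the column coefficients; g^{a} \oplus g^{b} is invertible as long as the two failed columns receive distinct coefficients, which is what makes the double recovery well defined.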
Example #16
RF_ReconEvent_t *
rf_GetNextReconEvent(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_ReconCtrl_t *rctrl = raidPtr->reconControl;
	RF_ReconEvent_t *event;
	int stall_count;

	RF_LOCK_MUTEX(rctrl->eq_mutex);
	/* q null and count==0 must be equivalent conditions */
	RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));

	/* ltsleep timeout value: secs = timo_val/hz.  'ticks' here means
	   accumulated execution time in microseconds, not softclock ticks */

#define MAX_RECON_EXEC_USECS (100 * 1000)  /* 100 ms */
#define RECON_DELAY_MS 25
#define RECON_TIMO     ((RECON_DELAY_MS * hz) / 1000)

	/* We are not preemptible in the kernel, but we don't want to run
	 * forever.  If we run without blocking for more than
	 * MAX_RECON_EXEC_USECS microseconds of accumulated execution time,
	 * delay for RECON_DELAY_MS before continuing.  This may murder us
	 * with context switches, so we may need to increase both
	 * MAX_RECON_EXEC_USECS and RECON_DELAY_MS. */
	if (reconDesc->reconExecTimerRunning) {
		int     status;

		RF_ETIMER_STOP(reconDesc->recon_exec_timer);
		RF_ETIMER_EVAL(reconDesc->recon_exec_timer);
		reconDesc->reconExecTicks +=
			RF_ETIMER_VAL_US(reconDesc->recon_exec_timer);
		if (reconDesc->reconExecTicks > reconDesc->maxReconExecTicks)
			reconDesc->maxReconExecTicks =
				reconDesc->reconExecTicks;
		if (reconDesc->reconExecTicks >= MAX_RECON_EXEC_USECS) {
			/* we've been running too long.  delay for
			 * RECON_DELAY_MS */
#if RF_RECON_STATS > 0
			reconDesc->numReconExecDelays++;
#endif				/* RF_RECON_STATS > 0 */

			status = ltsleep(&reconDesc->reconExecTicks, PRIBIO,
					 "recon delay", RECON_TIMO,
					 &rctrl->eq_mutex);
			RF_ASSERT(status == EWOULDBLOCK);
			reconDesc->reconExecTicks = 0;
		}
	}

	stall_count = 0;
	while (!rctrl->eventQueue) {
#if RF_RECON_STATS > 0
		reconDesc->numReconEventWaits++;
#endif				/* RF_RECON_STATS > 0 */

		ltsleep(&(rctrl)->eventQueue, PRIBIO,  "raidframe eventq",
			RF_EVENTQ_WAIT, &((rctrl)->eq_mutex));

		stall_count++;

		if ((stall_count > 10) && 
		    rctrl->headSepCBList) {
			/* There is work to do on the callback list, and
			   we've waited long enough... */
			rf_WakeupHeadSepCBWaiters(raidPtr);
			stall_count = 0;
		}
		reconDesc->reconExecTicks = 0;	/* we've just waited */
	}

	reconDesc->reconExecTimerRunning = 1;
	if (RF_ETIMER_VAL_US(reconDesc->recon_exec_timer)!=0) {
		/* it moved!!  reset the timer. */
		RF_ETIMER_START(reconDesc->recon_exec_timer);
	}
	event = rctrl->eventQueue;
	rctrl->eventQueue = event->next;
	event->next = NULL;
	rctrl->eq_count--;

	/* q null and count==0 must be equivalent conditions */
	RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
	RF_UNLOCK_MUTEX(rctrl->eq_mutex);
	return (event);
}
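The MAX_RECON_EXEC_USECS logic above is a generic "run for a bounded budget, then back off" throttle. A stand-alone user-space sketch of the same idea is given below, using clock_gettime and nanosleep in place of the kernel's cycle-counter timer and ltsleep; the constants simply echo the ones defined above and the function names are invented.

#include <time.h>

#define MAX_EXEC_USECS	(100 * 1000)	/* 100 ms budget, as above */
#define DELAY_MS	25		/* back off for 25 ms when over budget */

static long long
now_usecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000LL + ts.tv_nsec / 1000;
}

/*
 * Call between units of work: accumulates elapsed time and, once the budget
 * is exhausted, sleeps briefly and resets the accumulator so that other
 * activity gets a chance to run.
 */
static void
throttle(long long *accum_usecs, long long *last_usecs)
{
	long long now = now_usecs();

	*accum_usecs += now - *last_usecs;
	if (*accum_usecs >= MAX_EXEC_USECS) {
		struct timespec delay = { 0, DELAY_MS * 1000000L };

		nanosleep(&delay, NULL);
		*accum_usecs = 0;
		now = now_usecs();
	}
	*last_usecs = now;
}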