/*******************************************************************************************
 * This degraded-write function allows only two cases:
 *  1. If the write accesses the full failed stripe unit, then the access may span more
 *     than one stripe unit.
 *  2. If the write accesses only part of the failed SU, we assume that accesses spanning
 *     more than one stripe unit are not allowed, so that the write can be handled like a
 *     large write.
 *  The following function is based on these assumptions, so except in the second case it
 *  looks the same as a large-write encoding function.  This is not exactly the normal
 *  way to do a degraded write, however, since RAIDframe has to break accesses other
 *  than the above two cases into smaller accesses.  We may have to change
 *  rf_DegrESubroutine in the future.
 *******************************************************************************************/
void
rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	RF_PhysDiskAddr_t *pda;
	int     i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	RF_RowCol_t scol;
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	RF_ETIMER_START(timer);
	for (i = 0; i < node->numParams - 2; i += 2) {
		RF_ASSERT(node->params[i + 1].p != ebuf);
		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		scol = rf_EUCol(layoutPtr, pda->raidAddress);
		srcbuf = (char *) node->params[i + 1].p;
		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
}
/* Algorithm:
     1. Store the difference of the old data and the new data in the Rod buffer.
     2. Encode this buffer into the buffer that already holds the old 'E' information;
	the result can be shown to be the new 'E' information.
     3. Xor the Wnd buffer into the difference buffer to recover the original old data.
   An alternative would be to allocate a temporary buffer for the difference of the old
   and new data and encode that into the old 'E' buffer to form the new 'E'.  That
   approach runs at the same speed as the one above but needs more memory.
*/
int
rf_RegularONEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	int     EpdaIndex = (node->numParams - 1) / 2 - 1;	/* the index of the
								 * param holding
								 * the E pda */
	int     i, k;
	int     suoffset, length;
	RF_RowCol_t scol;
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	RF_PhysDiskAddr_t *pda;
#ifdef RAID_DIAGNOSTIC
	RF_PhysDiskAddr_t *EPDA =
	    (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
	int     ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);

	RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
	RF_ASSERT(ESUOffset == 0);
#endif /* RAID_DIAGNOSTIC */

	RF_ETIMER_START(timer);

	/* Xor the Wnd buffer into the Rod buffer; the difference of old and
	 * new data is left in the Rod buffer. */
	for (k = 0; k < EpdaIndex; k += 2) {
		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
		rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
	}
	/* Encode the buffer holding the difference of old and new data into
	 * the 'E' buffer. */
	for (i = 0; i < EpdaIndex; i += 2)
		if (node->params[i + 1].p != node->results[0]) {	/* results[0] is buf ptr
									 * of E */
			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
			srcbuf = (char *) node->params[i + 1].p;
			scol = rf_EUCol(layoutPtr, pda->raidAddress);
			suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
			destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset);
			rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
		}
	/* Recover the original old data to be used by parity encoding
	 * function in XorNode */
	for (k = 0; k < EpdaIndex; k += 2) {
		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
		rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);		/* XXX this was missing.. GO */
}
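/*
 * A standalone sketch of the XOR-difference trick used above (a hypothetical
 * helper for illustration, not RAIDframe code).  It shows the three steps
 * from the algorithm comment: fold the new data into the old-data (Rod)
 * buffer to form the difference, fold the difference into the redundancy
 * buffer, then fold the new data in again to restore the old data.  Here the
 * "encoding" is plain XOR; rf_e_encToBuf applies a column-dependent linear
 * transform instead, but the buffer discipline is identical.
 */
#include <stddef.h>

static void
update_redundancy_in_place(unsigned char *rod, const unsigned char *wnd,
    unsigned char *ebuf, size_t len)
{
	size_t j;

	for (j = 0; j < len; j++)
		rod[j] ^= wnd[j];	/* step 1: rod = old ^ new */
	for (j = 0; j < len; j++)
		ebuf[j] ^= rod[j];	/* step 2: fold difference into 'E' */
	for (j = 0; j < len; j++)
		rod[j] ^= wnd[j];	/* step 3: restore the old data */
}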
/*
 * The following three states create, execute, and post-process DAGs.
 * The error recovery unit is a single DAG.
 * By default, SelectAlgorithm creates an array of DAGs, one per parity stripe.
 * In some tricky cases, multiple dags per stripe are created.
 *   - DAGs within a parity stripe are executed sequentially (arbitrary order).
 *   - DAGs for distinct parity stripes are executed concurrently.
 *
 * Repeat until all DAGs complete successfully -or- DAG selection fails.
 *
 * while !done
 *   create dag(s) (SelectAlgorithm)
 *   if dag
 *     execute dag (DispatchDAG)
 *     if dag successful
 *       done (SUCCESS)
 *     else
 *       !done (RETRY - start over with new dags)
 *   else
 *     done (FAIL)
 */
int
rf_State_CreateDAG(RF_RaidAccessDesc_t *desc)
{
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
	RF_DagHeader_t *dag_h;
	int i, selectStatus;

	/*
	 * Generate a dag for the access, and fire it off. When the dag
	 * completes, we'll get re-invoked in the next state.
	 */
	RF_ETIMER_START(timer);
	/* SelectAlgorithm returns one or more dags. */
	selectStatus = rf_SelectAlgorithm(desc,
	    desc->flags | RF_DAG_SUPPRESS_LOCKS);
	if (rf_printDAGsDebug)
		for (i = 0; i < desc->numStripes; i++)
			rf_PrintDAGList(desc->dagArray[i].dags);
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	/* Update time to create all dags. */
	tracerec->specific.user.dag_create_us = RF_ETIMER_VAL_US(timer);

	desc->status = 0;	/* Good status. */

	if (selectStatus) {
		/* Failed to create a dag. */
		/*
		 * This happens when there are too many faults or incomplete
		 * dag libraries.
		 */
		printf("[Failed to create a DAG]\n");
		RF_PANIC();
	} else {
		/* Bind dags to desc. */
		for (i = 0; i < desc->numStripes; i++) {
			dag_h = desc->dagArray[i].dags;
			while (dag_h) {
				dag_h->bp = (struct buf *) desc->bp;
				dag_h->tracerec = tracerec;
				dag_h = dag_h->next;
			}
		}
		desc->flags |= RF_DAG_DISPATCH_RETURNED;
		desc->state++;	/* Next state should be rf_State_ExecuteDAG. */
	}
	return RF_FALSE;
}
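/*
 * A minimal user-space sketch of the retry policy described in the comment
 * above (hypothetical names; the real driver spreads these steps across the
 * rf_State_* handlers rather than one loop):
 */
#include <stdio.h>

struct toy_access { int attempts; };

static int toy_create_dags(struct toy_access *a) { return 0; }	/* 0 = dags created */
static int toy_execute_dags(struct toy_access *a) { return a->attempts++ < 2; }	/* fail twice, then succeed */
static void toy_free_dags(struct toy_access *a) { }

static int
toy_run_access(struct toy_access *a)
{
	for (;;) {
		if (toy_create_dags(a) != 0)
			return -1;		/* dag selection failed: FAIL */
		if (toy_execute_dags(a) == 0)
			return 0;		/* all dags successful: SUCCESS */
		toy_free_dags(a);		/* RETRY: start over with new dags */
	}
}

int
main(void)
{
	struct toy_access a = { 0 };

	printf("access %s after %d dispatch(es)\n",
	    toy_run_access(&a) == 0 ? "succeeded" : "failed", a.attempts);
	return 0;
}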
void
rf_ContinueDagAccess(RF_DagList_t *dagList)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &(dagList->desc->tracerec);
	RF_Etimer_t timer;
#endif
	RF_RaidAccessDesc_t *desc;
	RF_DagHeader_t *dag_h;
	int     i;

	desc = dagList->desc;

#if RF_ACC_TRACE > 0
	timer = tracerec->timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.exec_us = RF_ETIMER_VAL_US(timer);
	RF_ETIMER_START(tracerec->timer);
#endif

	/* skip to dag which just finished */
	dag_h = dagList->dags;
	for (i = 0; i < dagList->numDagsDone; i++) {
		dag_h = dag_h->next;
	}

	/* check to see if retry is required */
	if (dag_h->status == rf_rollBackward) {
		/* when a dag fails, mark desc status as bad and allow
		 * all other dags in the desc to execute to
		 * completion.  then, free all dags and start over */
		desc->status = 1;	/* bad status */
#if 0
		printf("raid%d: DAG failure: %c addr 0x%lx "
		       "(%ld) nblk 0x%x (%d) buf 0x%lx state %d\n",
		       desc->raidPtr->raidid, desc->type,
		       (long) desc->raidAddress,
		       (long) desc->raidAddress, (int) desc->numBlocks,
		       (int) desc->numBlocks,
		       (unsigned long) (desc->bufPtr), desc->state);
#endif
	}
	dagList->numDagsDone++;
	rf_ContinueRaidAccess(desc);
}
void
rf_ContinueDagAccess(RF_DagList_t *dagList)
{
	RF_AccTraceEntry_t *tracerec = &(dagList->desc->tracerec);
	RF_RaidAccessDesc_t *desc;
	RF_DagHeader_t *dag_h;
	RF_Etimer_t timer;
	int i;

	desc = dagList->desc;

	timer = tracerec->timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.exec_us = RF_ETIMER_VAL_US(timer);
	RF_ETIMER_START(tracerec->timer);

	/* Skip to dag which just finished. */
	dag_h = dagList->dags;
	for (i = 0; i < dagList->numDagsDone; i++) {
		dag_h = dag_h->next;
	}

	/* Check to see if retry is required. */
	if (dag_h->status == rf_rollBackward) {
		/*
		 * When a dag fails, mark desc status as bad and allow all
		 * other dags in the desc to execute to completion. Then,
		 * free all dags and start over.
		 */
		desc->status = 1;	/* Bad status. */
		printf("raid%d: DAG failure: %c addr 0x%lx (%ld)"
		       " nblk 0x%x (%d) buf 0x%lx.\n",
		       desc->raidPtr->raidid, desc->type,
		       (long) desc->raidAddress,
		       (long) desc->raidAddress,
		       (int) desc->numBlocks, (int) desc->numBlocks,
		       (unsigned long) (desc->bufPtr));
	}
	dagList->numDagsDone++;
	rf_ContinueRaidAccess(desc);
}
int
rf_State_DecrAccessCount(RF_RaidAccessDesc_t *desc)
{
	RF_Raid_t *raidPtr;

	raidPtr = desc->raidPtr;

	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
	raidPtr->accs_in_flight--;
	if (raidPtr->accesses_suspended && raidPtr->accs_in_flight == 0) {
		rf_SignalQuiescenceLock(raidPtr, raidPtr->reconDesc);
	}
	rf_UpdateUserStats(raidPtr, RF_ETIMER_VAL_US(desc->timer),
	    desc->numBlocks);
	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);

	desc->state++;
	return RF_FALSE;
}
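/*
 * A user-space sketch of the quiescence handshake above: the last in-flight
 * access signals the waiter once accesses are suspended and the in-flight
 * count drops to zero.  Hypothetical names, pthreads primitives instead of
 * RF_LOCK_MUTEX/rf_SignalQuiescenceLock:
 */
#include <pthread.h>

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_quiet = PTHREAD_COND_INITIALIZER;
static int q_suspended;		/* cf. accesses_suspended */
static int q_in_flight;		/* cf. accs_in_flight */

static void
access_done(void)
{
	pthread_mutex_lock(&q_lock);
	if (--q_in_flight == 0 && q_suspended)
		pthread_cond_signal(&q_quiet);	/* cf. rf_SignalQuiescenceLock */
	pthread_mutex_unlock(&q_lock);
}

static void
wait_for_quiescence(void)
{
	pthread_mutex_lock(&q_lock);
	q_suspended = 1;
	while (q_in_flight > 0)
		pthread_cond_wait(&q_quiet, &q_lock);
	pthread_mutex_unlock(&q_lock);
}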
int
rf_State_Map(RF_RaidAccessDesc_t *desc)
{
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;

	RF_ETIMER_START(timer);

	if (!(desc->asmap = rf_MapAccess(raidPtr, desc->raidAddress,
	     desc->numBlocks, desc->bufPtr, RF_DONT_REMAP)))
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.map_us = RF_ETIMER_VAL_US(timer);

	desc->state++;
	return RF_FALSE;
}
int
rf_State_Quiesce(RF_RaidAccessDesc_t *desc)
{
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
	int suspended = RF_FALSE;
	RF_Raid_t *raidPtr;

	raidPtr = desc->raidPtr;

	RF_ETIMER_START(timer);
	RF_ETIMER_START(desc->timer);

	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
	if (raidPtr->accesses_suspended) {
		RF_CallbackDesc_t *cb;
		cb = rf_AllocCallbackDesc();
		/*
		 * XXX The following cast is quite bogus...
		 * rf_ContinueRaidAccess takes a (RF_RaidAccessDesc_t *)
		 * as an argument... GO
		 */
		cb->callbackFunc = (void (*) (RF_CBParam_t))
		    rf_ContinueRaidAccess;
		cb->callbackArg.p = (void *) desc;
		cb->next = raidPtr->quiesce_wait_list;
		raidPtr->quiesce_wait_list = cb;
		suspended = RF_TRUE;
	}
	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.suspend_ovhd_us += RF_ETIMER_VAL_US(timer);

	if (suspended && rf_quiesceDebug)
		printf("Stalling access due to quiescence lock.\n");

	desc->state++;
	return suspended;
}
/**************************************************************************************
 * When the parity dies and one data disk dies, we use the second redundancy
 * information, 'E', to recover the data on the dead disk.  This function is used in
 * the recovery node of EO_110_CreateReadDAG.
 **************************************************************************************/
int
rf_RecoveryEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	RF_RowCol_t scol,	/* source logical column */
	        fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);	/* logical column of
									 * failed SU */
	int     i;
	RF_PhysDiskAddr_t *pda;
	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	memset((char *) node->results[0], 0,
	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
				if (i == node->numParams - 4)
					scol = RF_EO_MATRIX_DIM - 2;	/* the column of
									 * the redundant 'E' */
				else
					scol = rf_EUCol(layoutPtr, pda->raidAddress);
				srcbuf = (char *) node->params[i + 1].p;
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, 0));	/* node executed successfully */
}
int
rf_SimpleONEFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	int     retcode = 0;
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	int     length;
	RF_RowCol_t scol;
	RF_Etimer_t timer;

	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);	/* this is a pda of
														 * writeDataNodes */
		/* bxor to buffer of readDataNodes */
		retcode = rf_bxor(node->params[5].p, node->params[1].p, length);
		/* Find the corresponding column in the encoding matrix for
		 * the write column to be encoded into the redundant disk 'E'. */
		scol = rf_EUCol(layoutPtr, pda->raidAddress);
		srcbuf = node->params[1].p;
		destbuf = node->params[3].p;
		/* Start encoding process */
		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
		rf_bxor(node->params[5].p, node->params[1].p, length);
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
int
rf_ParityLogAppend(RF_ParityLogData_t *logData, int finish,
    RF_ParityLog_t **incomingLog, int clearReintFlag)
{
	int regionID, logItem, itemDone;
	RF_ParityLogData_t *item;
	int punt, done = RF_FALSE;
	RF_ParityLog_t *log;
	RF_Raid_t *raidPtr;
	RF_Etimer_t timer;
	int (*wakeFunc) (RF_DagNode_t * node, int status);
	void *wakeArg;

	/*
	 * Add parity to the appropriate log, one sector at a time. This
	 * routine is called by the dag functions ParityLogUpdateFunc and
	 * ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
	 *
	 * Parity to be logged is contained in a linked-list (logData). When
	 * this routine returns, every sector in the list will be in one of
	 * three places: 1) entered into the parity log 2) queued, waiting on
	 * reintegration 3) queued, waiting on a core log.
	 *
	 * Blocked work is passed to the ParityLoggingDiskManager for
	 * completion. Later, as conditions which required the block are
	 * removed, the work reenters this routine with the "finish" parameter
	 * set to "RF_TRUE."
	 *
	 * NON-BLOCKING
	 */

	RF_ASSERT(logData != NULL);
	raidPtr = logData->common->raidPtr;
	/* Lock the region for the first item in logData. */
	regionID = logData->regionID;
	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
	RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);

	if (clearReintFlag) {
		/*
		 * Enable flushing for this region. Holding both locks
		 * provides a synchronization barrier with
		 * rf_DumpParityLogToDisk.
		 */
		RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress ==
		    RF_TRUE);
		raidPtr->regionInfo[regionID].diskCount = 0;
		raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
		/* Flushing is now enabled. */
		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	}
	/* Process each item in logData. */
	while (logData) {
		/* Remove an item from logData. */
		item = logData;
		logData = logData->next;
		item->next = NULL;
		item->prev = NULL;

		if (rf_parityLogDebug)
			printf("[appending parity log data, region %d,"
			    " raidAddress %d, numSector %d]\n", item->regionID,
			    (int) item->diskAddress.raidAddress,
			    (int) item->diskAddress.numSector);

		/* See if we moved to a new region. */
		if (regionID != item->regionID) {
			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			regionID = item->regionID;
			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
		}
		punt = RF_FALSE;/*
				 * Set to RF_TRUE if work is blocked. This
				 * can happen in one of two ways: 1) no core
				 * log (rf_AcquireParityLog) 2) waiting on
				 * reintegration (rf_DumpParityLogToDisk).
				 * If punt is RF_TRUE, the dataItem was queued,
				 * so skip to next item.
				 */

		/*
		 * Process item, one sector at a time, until all sectors
		 * processed or we punt.
		 */
		if (item->diskAddress.numSector > 0)
			done = RF_FALSE;
		else
			RF_ASSERT(0);
		while (!punt && !done) {
			/* Verify that a core log exists for this region. */
			if (!raidPtr->regionInfo[regionID].coreLog) {
				/*
				 * Attempt to acquire a parity log. If
				 * acquisition fails, queue remaining work in
				 * data item and move to nextItem.
				 */
				if (incomingLog) {
					if (*incomingLog) {
						RF_ASSERT((*incomingLog)->next
						    == NULL);
						raidPtr->regionInfo[regionID]
						    .coreLog = *incomingLog;
						raidPtr->regionInfo[regionID]
						    .coreLog->regionID =
						     regionID;
						*incomingLog = NULL;
					} else
						raidPtr->regionInfo[regionID]
						    .coreLog =
						     rf_AcquireParityLog(item,
						      finish);
				} else
					raidPtr->regionInfo[regionID].coreLog =
					    rf_AcquireParityLog(item, finish);
				/*
				 * Note: rf_AcquireParityLog either returns
				 * a log or enqueues currentItem.
				 */
			}
			if (!raidPtr->regionInfo[regionID].coreLog)
				punt = RF_TRUE;	/* Failed to find a core log. */
			else {
				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog
				    ->next == NULL);
				/*
				 * Verify that the log has room for new
				 * entries.
				 */
				/*
				 * If log is full, dump it to disk and grab a
				 * new log.
				 */
				if (raidPtr->regionInfo[regionID].coreLog
				    ->numRecords == raidPtr->numSectorsPerLog)
				{
					/* Log is full, dump it to disk. */
					if (rf_DumpParityLogToDisk(finish,
					    item))
						/*
						 * Dump unsuccessful, blocked
						 * on reintegration.
						 */
						punt = RF_TRUE;
					else {
						/* Dump was successful. */
						if (incomingLog && *incomingLog) {
							RF_ASSERT((*incomingLog)->next == NULL);
							raidPtr->regionInfo[regionID].coreLog = *incomingLog;
							raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
							*incomingLog = NULL;
						} else
							raidPtr->regionInfo[regionID].coreLog = rf_AcquireParityLog(item, finish);
						/*
						 * If a core log is still not
						 * available, we must queue the
						 * remaining work and return.
						 */
						if (!raidPtr->regionInfo[regionID].coreLog)
							/* Blocked on log availability. */
							punt = RF_TRUE;
					}
				}
			}
			/*
			 * If we didn't punt on this item, attempt to add a
			 * sector to the core log.
			 */
			if (!punt) {
				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog
				    ->next == NULL);
				/*
				 * At this point, we have a core log with
				 * enough room for a sector.
				 */
				/* Copy a sector into the log. */
				log = raidPtr->regionInfo[regionID].coreLog;
				RF_ASSERT(log->numRecords <
				    raidPtr->numSectorsPerLog);
				logItem = log->numRecords++;
				log->records[logItem].parityAddr =
				    item->diskAddress;
				RF_ASSERT(log->records[logItem].parityAddr
				    .startSector >=
				    raidPtr->regionInfo[regionID]
				    .parityStartAddr);
				RF_ASSERT(log->records[logItem].parityAddr
				    .startSector <
				    raidPtr->regionInfo[regionID]
				    .parityStartAddr +
				    raidPtr->regionInfo[regionID]
				    .numSectorsParity);
				log->records[logItem].parityAddr.numSector = 1;
				log->records[logItem].operation =
				    item->common->operation;
				bcopy((item->common->bufPtr +
				    (item->bufOffset++ * (1 <<
				    item->common->raidPtr->logBytesPerSector))),
				    log->bufPtr + (logItem * (1 <<
				    item->common->raidPtr->logBytesPerSector)),
				    (1 << item->common->raidPtr
				     ->logBytesPerSector));
				item->diskAddress.numSector--;
				item->diskAddress.startSector++;
				if (item->diskAddress.numSector == 0)
					done = RF_TRUE;
			}
		}

		if (!punt) {
			/*
			 * Processed this item completely, decrement count of
			 * items to be processed.
			 */
			RF_ASSERT(item->diskAddress.numSector == 0);
			RF_LOCK_MUTEX(item->common->mutex);
			item->common->cnt--;
			if (item->common->cnt == 0)
				itemDone = RF_TRUE;
			else
				itemDone = RF_FALSE;
			RF_UNLOCK_MUTEX(item->common->mutex);
			if (itemDone) {
				/*
				 * Finished processing all log data for this
				 * I/O.  Return structs to the free list and
				 * invoke the wakeup function.
				 */
				/* Grab initial value of timer. */
				timer = item->common->startTime;
				RF_ETIMER_STOP(timer);
				RF_ETIMER_EVAL(timer);
				item->common->tracerec->plog_us +=
				    RF_ETIMER_VAL_US(timer);
				if (rf_parityLogDebug)
					printf("[waking process for region"
					    " %d]\n", item->regionID);
				wakeFunc = item->common->wakeFunc;
				wakeArg = item->common->wakeArg;
				rf_FreeParityLogCommonData(item->common);
				rf_FreeParityLogData(item);
				(wakeFunc) (wakeArg, 0);
			} else
				rf_FreeParityLogData(item);
		}
	}
	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
	if (rf_parityLogDebug)
		printf("[exiting ParityLogAppend]\n");
	return (0);
}
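/*
 * The heart of rf_ParityLogAppend is the "append one sector at a time,
 * dumping the core log whenever it fills" loop.  Below is a toy standalone
 * rendering of just that loop (hypothetical names and a pretend dump; the
 * punt/queue paths for a missing core log or a blocked reintegration are
 * elided):
 */
#include <stdio.h>

#define TOY_LOG_CAPACITY 4		/* cf. raidPtr->numSectorsPerLog */

struct toy_core_log { int numRecords; };

static void
toy_dump_log(struct toy_core_log *log)
{
	printf("[log full: dumping %d records]\n", log->numRecords);
	log->numRecords = 0;		/* pretend rf_DumpParityLogToDisk succeeded */
}

static void
toy_append_item(struct toy_core_log *log, int numSector)
{
	while (numSector > 0) {
		if (log->numRecords == TOY_LOG_CAPACITY)
			toy_dump_log(log);
		log->numRecords++;	/* copy one sector into the log */
		numSector--;
	}
}

int
main(void)
{
	struct toy_core_log log = { 0 };

	toy_append_item(&log, 6);	/* spans one dump */
	toy_append_item(&log, 3);	/* leaves a partly filled log */
	printf("%d record(s) still buffered\n", log.numRecords);
	return 0;
}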
/*
 * the following three states create, execute, and post-process dags
 * the error recovery unit is a single dag.
 * by default, SelectAlgorithm creates an array of dags, one per parity stripe
 * in some tricky cases, multiple dags per stripe are created
 *   - dags within a parity stripe are executed sequentially (arbitrary order)
 *   - dags for distinct parity stripes are executed concurrently
 *
 * repeat until all dags complete successfully -or- dag selection fails
 *
 * while !done
 *   create dag(s) (SelectAlgorithm)
 *   if dag
 *     execute dag (DispatchDAG)
 *     if dag successful
 *       done (SUCCESS)
 *     else
 *       !done (RETRY - start over with new dags)
 *   else
 *     done (FAIL)
 */
int
rf_State_CreateDAG(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_DagHeader_t *dag_h;
	RF_DagList_t *dagList;
	struct buf *bp;
	int     i, selectStatus;

	/* generate a dag for the access, and fire it off.  When the dag
	 * completes, we'll get re-invoked in the next state. */
#if RF_ACC_TRACE > 0
	RF_ETIMER_START(timer);
#endif
	/* SelectAlgorithm returns one or more dags */
	selectStatus = rf_SelectAlgorithm(desc, desc->flags | RF_DAG_SUPPRESS_LOCKS);
#if RF_DEBUG_VALIDATE_DAG
	if (rf_printDAGsDebug) {
		dagList = desc->dagList;
		for (i = 0; i < desc->numStripes; i++) {
			rf_PrintDAGList(dagList->dags);
			dagList = dagList->next;
		}
	}
#endif /* RF_DEBUG_VALIDATE_DAG */
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	/* update time to create all dags */
	tracerec->specific.user.dag_create_us = RF_ETIMER_VAL_US(timer);
#endif

	desc->status = 0;	/* good status */

	if (selectStatus || (desc->numRetries > RF_RETRY_THRESHOLD)) {
		/* failed to create a dag */
		/* this happens when there are too many faults or incomplete
		 * dag libraries */
		if (selectStatus) {
			printf("raid%d: failed to create a dag. "
			       "Too many component failures.\n",
			       desc->raidPtr->raidid);
		} else {
			printf("raid%d: IO failed after %d retries.\n",
			       desc->raidPtr->raidid, RF_RETRY_THRESHOLD);
		}

		desc->status = 1; /* bad status */
		/* skip straight to rf_State_Cleanup() */
		desc->state = rf_CleanupState;
		bp = (struct buf *)desc->bp;
		bp->b_error = EIO;
		bp->b_resid = bp->b_bcount;
	} else {
		/* bind dags to desc */
		dagList = desc->dagList;
		for (i = 0; i < desc->numStripes; i++) {
			dag_h = dagList->dags;
			while (dag_h) {
				dag_h->bp = (struct buf *) desc->bp;
#if RF_ACC_TRACE > 0
				dag_h->tracerec = tracerec;
#endif
				dag_h = dag_h->next;
			}
			dagList = dagList->next;
		}
		desc->flags |= RF_DAG_DISPATCH_RETURNED;
		desc->state++;	/* next state should be rf_State_ExecuteDAG */
	}
	return RF_FALSE;
}
int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/*
	 * The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and Q.
	 * Use P & Q to recover the second failed data unit in P. Zero fill
	 * Q, then apply the recovered data to P. Then apply the data being
	 * written to the failed drive. Then walk through the surviving drives,
	 * applying new data when it exists, otherwise the recovery data.
	 * Quite a mess.
	 *
	 *
	 * The params:
	 *
	 *   read pda0, read pda1, ..., read pda (numDataCol-3),
	 *   write pda0, ..., write pda (numStripeUnitAccess - numDataFailed),
	 *   failed pda, raidPtr, asmap
	 */

	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
	    node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	/* Determine the other failed data unit. */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
	    asmap->raidAddress);
	/* Need to determine the column of the other failed disk. */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* Compute the data unit offset within the column. */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
		    &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* Find the other failed column (skip the known failed unit's column). */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/*
	 * Recover the data. The column we want to recover, we write over the
	 * parity. The column we don't care about, we dump in q.
	 */
	if (coeff < i)		/* Recovering 'a'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* Recovering 'b'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero-fill Q, then fold P into it. */
	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* Now apply all the write data to the buffer. */
	/*
	 * Single stripe unit write case: The failed data is the only thing
	 * we are writing.
	 */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* Dest, src, len, coeff. */
	rf_IncQ((unsigned long *) qpda->bufPtr,
	    (unsigned long *) asmap->failedPDAs[0]->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* Now apply all the recovery data. */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
int
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap =
	    (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr, suoffset;
	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
	int two = 0;
	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
	char *buf;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector +
	     asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);
		ppda = node->params[np - 6].p;
		ppda2 = node->params[np - 5].p;
		qpda = node->params[np - 4].p;
		qpda2 = node->params[np - 3].p;
		d = (np - 6);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		qpda = node->params[np - 3].p;
		d = (np - 4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		buf = pda->bufPtr;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		len = pda->numSector;
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
		    pda->raidAddress);
		/* Compute the data unit offset within the column. */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* See if pda intersects a recovery pda. */
		rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		if (two)
			/* Apply to the second recovery pda pair as well. */
			rf_applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
	}

	/*
	 * OK, we got the parity back to the point where we can recover. We
	 * now need to determine the coeffs of the columns that need to be
	 * recovered; note that we only ever need to recover a single stripe
	 * unit here.
	 */

	if (asmap->failedPDAs[1] == NULL) {	/*
						 * Only a single stripe unit
						 * to recover.
						 */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
		    asmap->raidAddress);
		/* Need to determine the column of the other failed disk. */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
		    pda->raidAddress);
		/* Compute the data unit offset within the column. */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr,
			    npda.raidAddress, &(npda.row), &(npda.col),
			    &(npda.startSector), 0);
			/* Find the other failed column (skip the known failed unit's column). */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col]
			    .status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/*
		 * Recover the data. Since we need only to recover one
		 * column, we overwrite the parity with the other one.
		 */
		if (coeff < i)	/* Recovering 'a'. */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector),
			    coeff, i);
		else		/* Recovering 'b'. */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector),
			    i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
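/*
 * Both functions above lean on rf_PQ_recover to solve for the missing data.
 * The sketch below is a standalone, user-space illustration of that
 * two-erasure solve over GF(2^8).  Everything here is an assumption for
 * illustration only: coefficients g^a and g^b, generator polynomial 0x11d,
 * and single bytes instead of sector buffers.  After the surviving columns
 * are folded in, P' = Da + Db and Q' = g^a*Da + g^b*Db (addition in GF(2^8)
 * is XOR), which is a 2x2 linear system in the two lost bytes.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t gf_exp[512], gf_log[256];	/* doubled exp table avoids a mod 255 */

static void
gf_init(void)
{
	unsigned x = 1;
	int i;

	for (i = 0; i < 255; i++) {
		gf_exp[i] = (uint8_t) x;
		gf_log[x] = (uint8_t) i;
		x <<= 1;
		if (x & 0x100)
			x ^= 0x11d;	/* reduce by the generator polynomial */
	}
	for (i = 255; i < 512; i++)
		gf_exp[i] = gf_exp[i - 255];
}

static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return 0;
	return gf_exp[gf_log[a] + gf_log[b]];
}

static uint8_t
gf_div(uint8_t a, uint8_t b)	/* b must be nonzero */
{
	if (a == 0)
		return 0;
	return gf_exp[gf_log[a] + 255 - gf_log[b]];
}

int
main(void)
{
	/* Two lost data bytes in columns a and b, with coefficients g^a, g^b. */
	uint8_t Da = 0x5a, Db = 0xc3;
	int a = 2, b = 5;
	uint8_t Pp, Qp, ga, gb, ra, rb;

	gf_init();
	ga = gf_exp[a];
	gb = gf_exp[b];

	/* What survives after folding all good columns into P and Q. */
	Pp = Da ^ Db;				/* P' = Da + Db         */
	Qp = gf_mul(ga, Da) ^ gf_mul(gb, Db);	/* Q' = g^a*Da + g^b*Db */

	/* Solve the system: Da = (Q' + g^b*P') / (g^a + g^b), Db = P' + Da. */
	ra = gf_div(Qp ^ gf_mul(gb, Pp), ga ^ gb);
	rb = Pp ^ ra;

	assert(ra == Da && rb == Db);
	printf("recovered Da=0x%02x Db=0x%02x\n", ra, rb);
	return 0;
}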
int
rf_State_Quiesce(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_CallbackDesc_t *cb;
	RF_Raid_t *raidPtr;
	int     suspended = RF_FALSE;
	int need_cb, used_cb;

	raidPtr = desc->raidPtr;

#if RF_ACC_TRACE > 0
	RF_ETIMER_START(timer);
	RF_ETIMER_START(desc->timer);
#endif

	need_cb = 0;
	used_cb = 0;
	cb = NULL;

	rf_lock_mutex2(raidPtr->access_suspend_mutex);
	/* Do an initial check to see if we might need a callback structure */
	if (raidPtr->accesses_suspended) {
		need_cb = 1;
	}
	rf_unlock_mutex2(raidPtr->access_suspend_mutex);

	if (need_cb) {
		/* create a callback if we might need it...
		   and we likely do. */
		cb = rf_AllocCallbackDesc();
	}

	rf_lock_mutex2(raidPtr->access_suspend_mutex);
	if (raidPtr->accesses_suspended) {
		cb->callbackFunc = (void (*) (RF_CBParam_t)) rf_ContinueRaidAccess;
		cb->callbackArg.p = (void *) desc;
		cb->next = raidPtr->quiesce_wait_list;
		raidPtr->quiesce_wait_list = cb;
		suspended = RF_TRUE;
		used_cb = 1;
	}
	rf_unlock_mutex2(raidPtr->access_suspend_mutex);

	if ((need_cb == 1) && (used_cb == 0)) {
		rf_FreeCallbackDesc(cb);
	}

#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.suspend_ovhd_us += RF_ETIMER_VAL_US(timer);
#endif

#if RF_DEBUG_QUIESCE
	if (suspended && rf_quiesceDebug)
		printf("Stalling access due to quiescence lock\n");
#endif
	desc->state++;
	return suspended;
}
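/*
 * The version above is careful never to call rf_AllocCallbackDesc() while
 * holding access_suspend_mutex: it peeks at the condition, allocates outside
 * the lock, then re-checks before committing, and frees the allocation if it
 * lost the race.  A generic pthreads sketch of that pattern (hypothetical
 * names, not RAIDframe code):
 */
#include <pthread.h>
#include <stdlib.h>

struct toy_waiter { struct toy_waiter *next; };

static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
static int toy_suspended;			/* cf. accesses_suspended */
static struct toy_waiter *toy_wait_list;	/* cf. quiesce_wait_list */

static int
toy_maybe_enqueue_waiter(void)
{
	struct toy_waiter *w = NULL;
	int queued = 0, need;

	pthread_mutex_lock(&toy_lock);
	need = toy_suspended;			/* initial check only */
	pthread_mutex_unlock(&toy_lock);

	if (need)
		w = malloc(sizeof(*w));		/* may sleep: do it unlocked */

	pthread_mutex_lock(&toy_lock);
	if (toy_suspended && w != NULL) {	/* re-check: state may have changed */
		w->next = toy_wait_list;
		toy_wait_list = w;
		queued = 1;
	}
	pthread_mutex_unlock(&toy_lock);

	if (w != NULL && !queued)
		free(w);			/* lost the race; discard */
	return queued;
}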
int
rf_State_Lock(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_AccessStripeMap_t *asm_p;
	RF_StripeNum_t lastStripeID = -1;
	int     suspended = RF_FALSE;

#if RF_ACC_TRACE > 0
	RF_ETIMER_START(timer);
#endif

	/* acquire each lock that we don't already hold */
	for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
		RF_ASSERT(RF_IO_IS_R_OR_W(desc->type));
		if (!rf_suppressLocksAndLargeWrites &&
		    asm_p->parityInfo &&
		    !(desc->flags & RF_DAG_SUPPRESS_LOCKS) &&
		    !(asm_p->flags & RF_ASM_FLAGS_LOCK_TRIED)) {
			asm_p->flags |= RF_ASM_FLAGS_LOCK_TRIED;
				/* locks must be acquired hierarchically */
			RF_ASSERT(asm_p->stripeID > lastStripeID);
			lastStripeID = asm_p->stripeID;

			RF_INIT_LOCK_REQ_DESC(asm_p->lockReqDesc, desc->type,
					      (void (*) (struct buf *)) rf_ContinueRaidAccess, desc, asm_p,
					      raidPtr->Layout.dataSectorsPerStripe);
			if (rf_AcquireStripeLock(raidPtr->lockTable, asm_p->stripeID,
						 &asm_p->lockReqDesc)) {
				suspended = RF_TRUE;
				break;
			}
		}
		if (desc->type == RF_IO_TYPE_WRITE &&
		    raidPtr->status == rf_rs_reconstructing) {
			if (!(asm_p->flags & RF_ASM_FLAGS_FORCE_TRIED)) {
				int     val;

				asm_p->flags |= RF_ASM_FLAGS_FORCE_TRIED;
				val = rf_ForceOrBlockRecon(raidPtr, asm_p,
							   (void (*) (RF_Raid_t *, void *)) rf_ContinueRaidAccess, desc);
				if (val == 0) {
					asm_p->flags |= RF_ASM_FLAGS_RECON_BLOCKED;
				} else {
					suspended = RF_TRUE;
					break;
				}
			} else {
#if RF_DEBUG_PSS > 0
				if (rf_pssDebug) {
					printf("raid%d: skipping force/block because already done, psid %ld\n",
					       desc->raidPtr->raidid,
					       (long) asm_p->stripeID);
				}
#endif
			}
		} else {
#if RF_DEBUG_PSS > 0
			if (rf_pssDebug) {
				printf("raid%d: skipping force/block because not write or not under recon, psid %ld\n",
				       desc->raidPtr->raidid,
				       (long) asm_p->stripeID);
			}
#endif
		}
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);
#endif
	if (suspended)
		return (RF_TRUE);

	desc->state++;
	return (RF_FALSE);
}
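/*
 * The RF_ASSERT(asm_p->stripeID > lastStripeID) above is the whole deadlock
 * story: every accessor takes its stripe locks in strictly increasing
 * stripeID order, so no two accessors can each hold a lock the other wants.
 * A tiny standalone check of that ordering rule (hypothetical helper):
 */
#include <stdio.h>

static int
toy_check_lock_order(const long *stripeIDs, int n)
{
	long last = -1;		/* cf. lastStripeID */
	int i;

	for (i = 0; i < n; i++) {
		if (stripeIDs[i] <= last)
			return -1;	/* hierarchy violated: would risk deadlock */
		/* the stripe lock for stripeIDs[i] would be acquired here */
		last = stripeIDs[i];
	}
	return 0;
}

int
main(void)
{
	long ok[] = { 3, 7, 12 };
	long bad[] = { 7, 3 };

	printf("ordered:   %s\n", toy_check_lock_order(ok, 3) == 0 ? "acceptable" : "rejected");
	printf("unordered: %s\n", toy_check_lock_order(bad, 2) == 0 ? "acceptable" : "rejected");
	return 0;
}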
/* Only make it this far if all dags complete successfully. */
int
rf_State_Cleanup(RF_RaidAccessDesc_t *desc)
{
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_AccessStripeMap_t *asm_p;
	RF_DagHeader_t *dag_h;
	RF_Etimer_t timer;
	int i;

	desc->state++;

	timer = tracerec->timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.dag_retry_us = RF_ETIMER_VAL_US(timer);

	/* The RAID I/O is complete. Clean up. */
	tracerec->specific.user.dag_retry_us = 0;

	RF_ETIMER_START(timer);
	if (desc->flags & RF_DAG_RETURN_DAG) {
		/* Copy dags into paramDAG. */
		*(desc->paramDAG) = desc->dagArray[0].dags;
		dag_h = *(desc->paramDAG);
		for (i = 1; i < desc->numStripes; i++) {
			/* Concatenate dags from remaining stripes. */
			RF_ASSERT(dag_h);
			while (dag_h->next)
				dag_h = dag_h->next;
			dag_h->next = desc->dagArray[i].dags;
		}
	} else {
		/* Free all dags. */
		for (i = 0; i < desc->numStripes; i++) {
			rf_FreeDAG(desc->dagArray[i].dags);
		}
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us = RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
	if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) {
		for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
			if (!rf_suppressLocksAndLargeWrites &&
			    asm_p->parityInfo &&
			    !(desc->flags & RF_DAG_SUPPRESS_LOCKS)) {
				RF_ASSERT_VALID_LOCKREQ(&asm_p->lockReqDesc);
				rf_ReleaseStripeLock(raidPtr->lockTable,
				    asm_p->stripeID, &asm_p->lockReqDesc);
			}
			if (asm_p->flags & RF_ASM_FLAGS_RECON_BLOCKED) {
				rf_UnblockRecon(raidPtr, asm_p);
			}
		}
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
	if (desc->flags & RF_DAG_RETURN_ASM)
		*(desc->paramASM) = asmh;
	else
		rf_FreeAccessStripeMap(asmh);
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_STOP(desc->timer);
	RF_ETIMER_EVAL(desc->timer);

	timer = desc->tracerec.tot_timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	desc->tracerec.total_us = RF_ETIMER_VAL_US(timer);

	rf_LogTraceRec(raidPtr, tracerec);

	desc->flags |= RF_DAG_ACCESS_COMPLETE;

	return RF_FALSE;
}
/* only make it this far if all dags complete successfully */
int
rf_State_Cleanup(RF_RaidAccessDesc_t *desc)
{
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
	RF_Etimer_t timer;
#endif
	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_AccessStripeMap_t *asm_p;
	RF_DagList_t *dagList;
	int i;

	desc->state++;

#if RF_ACC_TRACE > 0
	timer = tracerec->timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.dag_retry_us = RF_ETIMER_VAL_US(timer);

	/* the RAID I/O is complete.  Clean up. */
	tracerec->specific.user.dag_retry_us = 0;

	RF_ETIMER_START(timer);
#endif
	/* free all dags */
	dagList = desc->dagList;
	for (i = 0; i < desc->numStripes; i++) {
		rf_FreeDAG(dagList->dags);
		dagList = dagList->next;
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us = RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
#endif
	for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
		if (!rf_suppressLocksAndLargeWrites &&
		    asm_p->parityInfo &&
		    !(desc->flags & RF_DAG_SUPPRESS_LOCKS)) {
			RF_ASSERT_VALID_LOCKREQ(&asm_p->lockReqDesc);
			rf_ReleaseStripeLock(raidPtr->lockTable,
					     asm_p->stripeID,
					     &asm_p->lockReqDesc);
		}
		if (asm_p->flags & RF_ASM_FLAGS_RECON_BLOCKED) {
			rf_UnblockRecon(raidPtr, asm_p);
		}
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_START(timer);
#endif
	rf_FreeAccessStripeMap(asmh);
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->specific.user.cleanup_us += RF_ETIMER_VAL_US(timer);

	RF_ETIMER_STOP(desc->timer);
	RF_ETIMER_EVAL(desc->timer);

	timer = desc->tracerec.tot_timer;
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	desc->tracerec.total_us = RF_ETIMER_VAL_US(timer);

	rf_LogTraceRec(raidPtr, tracerec);
#endif
	desc->flags |= RF_DAG_ACCESS_COMPLETE;

	return RF_FALSE;
}
RF_ReconEvent_t *
rf_GetNextReconEvent(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_ReconCtrl_t *rctrl = raidPtr->reconControl;
	RF_ReconEvent_t *event;
	int stall_count;

	RF_LOCK_MUTEX(rctrl->eq_mutex);
	/* q null and count==0 must be equivalent conditions */
	RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));

	/* mpsleep timeout value: secs = timo_val/hz.  'ticks' here is
	   defined as cycle-counter ticks, not softclock ticks */

#define MAX_RECON_EXEC_USECS (100 * 1000)  /* 100 ms */
#define RECON_DELAY_MS 25
#define RECON_TIMO     ((RECON_DELAY_MS * hz) / 1000)

	/* we are not pre-emptible in the kernel, but we don't want to run
	 * forever.  If we run w/o blocking for more than MAX_RECON_EXEC_USECS
	 * microseconds, delay for RECON_DELAY_MS before continuing.
	 * this may murder us with context switches, so we may need to
	 * increase both MAX_RECON_EXEC_USECS and RECON_DELAY_MS. */
	if (reconDesc->reconExecTimerRunning) {
		int     status;

		RF_ETIMER_STOP(reconDesc->recon_exec_timer);
		RF_ETIMER_EVAL(reconDesc->recon_exec_timer);
		reconDesc->reconExecTicks +=
			RF_ETIMER_VAL_US(reconDesc->recon_exec_timer);
		if (reconDesc->reconExecTicks > reconDesc->maxReconExecTicks)
			reconDesc->maxReconExecTicks =
				reconDesc->reconExecTicks;
		if (reconDesc->reconExecTicks >= MAX_RECON_EXEC_USECS) {
			/* we've been running too long.  delay for
			 * RECON_DELAY_MS */
#if RF_RECON_STATS > 0
			reconDesc->numReconExecDelays++;
#endif				/* RF_RECON_STATS > 0 */

			status = ltsleep(&reconDesc->reconExecTicks, PRIBIO,
					 "recon delay", RECON_TIMO,
					 &rctrl->eq_mutex);
			RF_ASSERT(status == EWOULDBLOCK);
			reconDesc->reconExecTicks = 0;
		}
	}

	stall_count = 0;
	while (!rctrl->eventQueue) {
#if RF_RECON_STATS > 0
		reconDesc->numReconEventWaits++;
#endif				/* RF_RECON_STATS > 0 */

		ltsleep(&(rctrl)->eventQueue, PRIBIO,  "raidframe eventq",
			RF_EVENTQ_WAIT, &((rctrl)->eq_mutex));

		stall_count++;

		if ((stall_count > 10) && 
		    rctrl->headSepCBList) {
			/* There is work to do on the callback list, and
			   we've waited long enough... */
			rf_WakeupHeadSepCBWaiters(raidPtr);
			stall_count = 0;
		}
		reconDesc->reconExecTicks = 0;	/* we've just waited */
	}

	reconDesc->reconExecTimerRunning = 1;
	if (RF_ETIMER_VAL_US(reconDesc->recon_exec_timer)!=0) {
		/* it moved!!  reset the timer. */
		RF_ETIMER_START(reconDesc->recon_exec_timer);
	}
	event = rctrl->eventQueue;
	rctrl->eventQueue = event->next;
	event->next = NULL;
	rctrl->eq_count--;

	/* q null and count==0 must be equivalent conditions */
	RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
	RF_UNLOCK_MUTEX(rctrl->eq_mutex);
	return (event);
}
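/*
 * The budget-and-yield throttle above translates directly to user space:
 * charge each burst of work against a budget and voluntarily block once it
 * is exhausted.  A hypothetical POSIX sketch (the names and the 100 ms /
 * 25 ms constants mirror MAX_RECON_EXEC_USECS and RECON_DELAY_MS above):
 */
#include <stdio.h>
#include <time.h>

#define TOY_EXEC_BUDGET_US	(100 * 1000)		/* 100 ms of unblocked work */
#define TOY_DELAY_NS		(25 * 1000 * 1000)	/* then yield for 25 ms */

static long toy_exec_us;	/* cf. reconDesc->reconExecTicks */

static void
toy_charge_and_maybe_yield(long ran_us)
{
	toy_exec_us += ran_us;
	if (toy_exec_us >= TOY_EXEC_BUDGET_US) {
		struct timespec ts = { 0, TOY_DELAY_NS };

		nanosleep(&ts, NULL);	/* cf. ltsleep(..., RECON_TIMO, ...) */
		toy_exec_us = 0;	/* we just blocked: budget renewed */
	}
}

int
main(void)
{
	int i;

	for (i = 0; i < 8; i++)
		toy_charge_and_maybe_yield(30 * 1000);	/* pretend 30 ms of work */
	printf("%ld us of budget consumed since last yield\n", toy_exec_us);
	return 0;
}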