Beispiel #1
0
void
rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
				      RF_AccessStripeMap_t *asmap,
				      RF_DagHeader_t *dag_h, void *bp,
				      RF_RaidAccessFlags_t flags,
				      RF_AllocListElem_t *allocList,
				      int nfaults,
				      int (*redFunc) (RF_DagNode_t *),
				      int allowBufferRecycle)
{
	int     nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
	        rdnodesFaked;
	RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
	RF_DagNode_t *wndNodes, *rrdNodes, *xorNode, *commitNode;
	RF_DagNode_t *tmpNode, *tmpwndNode, *tmprrdNode;
	RF_SectorCount_t sectorsPerSU;
	RF_ReconUnitNum_t which_ru;
	char   *xorTargetBuf = NULL;	/* the target buffer for the XOR
					 * operation */
	char   overlappingPDAs[RF_MAXCOL];/* a temporary array of flags */
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	RF_PhysDiskAddr_t *pda, *parityPDA;
	RF_StripeNum_t parityStripeID;
	RF_PhysDiskAddr_t *failedPDA;
	RF_RaidLayout_t *layoutPtr;

	layoutPtr = &(raidPtr->Layout);
	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
	    &which_ru);
	sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	/* failedPDA points to the pda within the asm that targets the failed
	 * disk */
	failedPDA = asmap->failedPDAs[0];

#if RF_DEBUG_DAG
	if (rf_dagDebug)
		printf("[Creating degraded-write DAG]\n");
#endif

	RF_ASSERT(asmap->numDataFailed == 1);
	dag_h->creator = "SimpleDegradedWriteDAG";

	/*
         * Generate two ASMs identifying the surviving data
         * we need in order to recover the lost data.
         */
	/* overlappingPDAs array must be zero'd */
	memset(overlappingPDAs, 0, RF_MAXCOL);
	rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
	    &nXorBufs, NULL, overlappingPDAs, allocList);

	/* create all the nodes at once */
	nWndNodes = asmap->numStripeUnitsAccessed - 1;	/* no access is
							 * generated for the
							 * failed pda */

	nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
	    ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
	/*
         * XXX
         *
         * There's a bug with a complete stripe overwrite- that means 0 reads
         * of old data, and the rest of the DAG generation code doesn't like
         * that. A release is coming, and I don't wanna risk breaking a critical
         * DAG generator, so here's what I'm gonna do- if there's no read nodes,
         * I'm gonna fake there being a read node, and I'm gonna swap in a
         * no-op node in its place (to make all the link-up code happy).
         * This should be fixed at some point.  --jimz
         */
	if (nRrdNodes == 0) {
		nRrdNodes = 1;
		rdnodesFaked = 1;
	} else {
		rdnodesFaked = 0;
	}
	/* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
	nNodes = 5 + nfaults + nWndNodes + nRrdNodes;

	blockNode = rf_AllocDAGNode();
	blockNode->list_next = dag_h->nodes;
	dag_h->nodes = blockNode;

	commitNode = rf_AllocDAGNode();
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	unblockNode = rf_AllocDAGNode();
	unblockNode->list_next = dag_h->nodes;
	dag_h->nodes = unblockNode;

	termNode = rf_AllocDAGNode();
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

	xorNode = rf_AllocDAGNode();
	xorNode->list_next = dag_h->nodes;
	dag_h->nodes = xorNode;

	wnpNode = rf_AllocDAGNode();
	wnpNode->list_next = dag_h->nodes;
	dag_h->nodes = wnpNode;

	for (i = 0; i < nWndNodes; i++) {
		tmpNode = rf_AllocDAGNode();
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wndNodes = dag_h->nodes;

	for (i = 0; i < nRrdNodes; i++) {
		tmpNode = rf_AllocDAGNode();
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	rrdNodes = dag_h->nodes;

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		wnqNode = rf_AllocDAGNode();
		wnqNode->list_next = dag_h->nodes;
		dag_h->nodes = wnqNode;
	} else {
#endif
		wnqNode = NULL;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	}
#endif

	/* this dag can not commit until all rrd and xor Nodes have completed */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	RF_ASSERT(nRrdNodes > 0);
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
	    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
	    nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);

	/*
         * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
         * the failed buffer, save a pointer to it so we can use it as the target
         * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
         * a buffer is the same size as the failed buffer, it must also be at the
         * same alignment within the SU.
         */
	i = 0;
	tmprrdNode = rrdNodes;
	if (new_asm_h[0]) {
		for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
		    i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
		    i++, pda = pda->next) {
			rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			tmprrdNode->params[0].p = pda;
			tmprrdNode->params[1].p = pda->bufPtr;
			tmprrdNode->params[2].v = parityStripeID;
			tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			tmprrdNode = tmprrdNode->list_next;
		}
	}
	/* i now equals the number of stripe units accessed in new_asm_h[0] */
	/* Note that for tmprrdNode, this means a continuation from above, so no need to
	   assign it anything.. */
	if (new_asm_h[1]) {
		for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
		    j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
		    j++, pda = pda->next) {
			rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			tmprrdNode->params[0].p = pda;
			tmprrdNode->params[1].p = pda->bufPtr;
			tmprrdNode->params[2].v = parityStripeID;
			tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
				xorTargetBuf = pda->bufPtr;
			tmprrdNode = tmprrdNode->list_next;
		}
	}
	if (rdnodesFaked) {
		/*
	         * This is where we'll init that fake noop read node
	         * (XXX should the wakeup func be different?)
	         */
		/* note that rrdNodes will just be a single node... */
		rf_InitNode(rrdNodes, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
	}
	/*
         * Make a PDA for the parity unit.  The parity PDA should start at
         * the same offset into the SU as the failed PDA.
         */
	/* Danner comment: I don't think this copy is really necessary. We are
	 * in one of two cases here. (1) The entire failed unit is written.
	 * Then asmap->parityInfo will describe the entire parity. (2) We are
	 * only writing a subset of the failed unit and nothing else. Then the
	 * asmap->parityInfo describes the failed unit and the copy can also
	 * be avoided. */

	parityPDA = rf_AllocPhysDiskAddr();
	parityPDA->next = dag_h->pda_cleanup_list;
	dag_h->pda_cleanup_list = parityPDA;
	parityPDA->col = asmap->parityInfo->col;
	parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
	    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
	parityPDA->numSector = failedPDA->numSector;

	if (!xorTargetBuf) {
		xorTargetBuf = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
	}
	/* init the Wnp node */
	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
	    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
	wnpNode->params[0].p = parityPDA;
	wnpNode->params[1].p = xorTargetBuf;
	wnpNode->params[2].v = parityStripeID;
	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* fill in the Wnq Node */
	if (nfaults == 2) {
		{
			RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
			    (RF_PhysDiskAddr_t *), allocList);
			parityPDA->col = asmap->qInfo->col;
			parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
			    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
			parityPDA->numSector = failedPDA->numSector;

			rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
			wnqNode->params[0].p = parityPDA;
			RF_MallocAndAdd(xorNode->results[1],
			    rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
			wnqNode->params[1].p = xorNode->results[1];
			wnqNode->params[2].v = parityStripeID;
			wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		}
void
rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
			     RF_DagHeader_t *dag_h, void *bp,
			     RF_RaidAccessFlags_t flags,
			     RF_AllocListElem_t *allocList,
			     int nfaults, int (*redFunc) (RF_DagNode_t *),
			     int allowBufferRecycle)
{
	RF_DagNode_t *wndNodes, *rodNodes, *xorNode, *wnpNode, *tmpNode;
	RF_DagNode_t *blockNode, *commitNode, *termNode;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	RF_DagNode_t *wnqNode;
#endif
	int     nWndNodes, nRodNodes, i, nodeNum, asmNum;
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	RF_StripeNum_t parityStripeID;
	char   *sosBuffer, *eosBuffer;
	RF_ReconUnitNum_t which_ru;
	RF_RaidLayout_t *layoutPtr;
	RF_PhysDiskAddr_t *pda;

	layoutPtr = &(raidPtr->Layout);
	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
							asmap->raidAddress,
							&which_ru);

#if RF_DEBUG_DAG
	if (rf_dagDebug) {
		printf("[Creating large-write DAG]\n");
	}
#endif
	dag_h->creator = "LargeWriteDAG";

	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* alloc the nodes: Wnd, xor, commit, block, term, and  Wnp */
	nWndNodes = asmap->numStripeUnitsAccessed;

	for (i = 0; i < nWndNodes; i++) {
		tmpNode = rf_AllocDAGNode();
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wndNodes = dag_h->nodes;

	xorNode = rf_AllocDAGNode();
	xorNode->list_next = dag_h->nodes;
	dag_h->nodes = xorNode;

	wnpNode = rf_AllocDAGNode();
	wnpNode->list_next = dag_h->nodes;
	dag_h->nodes = wnpNode;

	blockNode = rf_AllocDAGNode();
	blockNode->list_next = dag_h->nodes;
	dag_h->nodes = blockNode;

	commitNode = rf_AllocDAGNode();
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	termNode = rf_AllocDAGNode();
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		wnqNode = rf_AllocDAGNode();
	} else {
		wnqNode = NULL;
	}
#endif
	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h,
					new_asm_h, &nRodNodes, &sosBuffer,
					&eosBuffer, allocList);
	if (nRodNodes > 0) {
		for (i = 0; i < nRodNodes; i++) {
			tmpNode = rf_AllocDAGNode();
			tmpNode->list_next = dag_h->nodes;
			dag_h->nodes = tmpNode;
		}
		rodNodes = dag_h->nodes;
	} else {
		rodNodes = NULL;
	}

	/* begin node initialization */
	if (nRodNodes > 0) {
		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
			    rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0,
			    dag_h, "Nil", allocList);
	} else {
		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
			    rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0,
			    dag_h, "Nil", allocList);
	}

	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
		    rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0,
		    dag_h, "Cmt", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
		    rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0,
		    dag_h, "Trm", allocList);

	/* initialize the Rod nodes */
	tmpNode = rodNodes;
	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
		if (new_asm_h[asmNum]) {
			pda = new_asm_h[asmNum]->stripeMap->physInfo;
			while (pda) {
				rf_InitNode(tmpNode, rf_wait,
					    RF_FALSE, rf_DiskReadFunc,
					    rf_DiskReadUndoFunc,
					    rf_GenericWakeupFunc,
					    1, 1, 4, 0, dag_h,
					    "Rod", allocList);
				tmpNode->params[0].p = pda;
				tmpNode->params[1].p = pda->bufPtr;
				tmpNode->params[2].v = parityStripeID;
				tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
				    which_ru);
				nodeNum++;
				pda = pda->next;
				tmpNode = tmpNode->list_next;
			}
		}
	}
	RF_ASSERT(nodeNum == nRodNodes);

	/* initialize the wnd nodes */
	pda = asmap->physInfo;
	tmpNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		rf_InitNode(tmpNode, rf_wait, RF_FALSE,
			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0,
			    dag_h, "Wnd", allocList);
		RF_ASSERT(pda != NULL);
		tmpNode->params[0].p = pda;
		tmpNode->params[1].p = pda->bufPtr;
		tmpNode->params[2].v = parityStripeID;
		tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
		tmpNode = tmpNode->list_next;
	}

	/* initialize the redundancy node */
	if (nRodNodes > 0) {
		rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
			    rf_NullNodeUndoFunc, NULL, 1,
			    nRodNodes, 2 * (nWndNodes + nRodNodes) + 1,
			    nfaults, dag_h, "Xr ", allocList);
	} else {
		rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
			    rf_NullNodeUndoFunc, NULL, 1,
			    1, 2 * (nWndNodes + nRodNodes) + 1,
			    nfaults, dag_h, "Xr ", allocList);
	}
	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
	tmpNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		/* pda */
		xorNode->params[2 * i + 0] = tmpNode->params[0];
		/* buf ptr */
		xorNode->params[2 * i + 1] = tmpNode->params[1];
		tmpNode = tmpNode->list_next;
	}
	tmpNode = rodNodes;
	for (i = 0; i < nRodNodes; i++) {
		/* pda */
		xorNode->params[2 * (nWndNodes + i) + 0] = tmpNode->params[0];
		/* buf ptr */
		xorNode->params[2 * (nWndNodes + i) + 1] = tmpNode->params[1];
		tmpNode = tmpNode->list_next;
	}
	/* xor node needs to get at RAID information */
	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;

	/*
         * Look for an Rod node that reads a complete SU. If none,
         * alloc a buffer to receive the parity info. Note that we
         * can't use a new data buffer because it will not have gotten
         * written when the xor occurs.  */
	if (allowBufferRecycle) {
		tmpNode = rodNodes;
		for (i = 0; i < nRodNodes; i++) {
			if (((RF_PhysDiskAddr_t *) tmpNode->params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
				break;
			tmpNode = tmpNode->list_next;
		}
	}
	if ((!allowBufferRecycle) || (i == nRodNodes)) {
		xorNode->results[0] = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit));
	} else {
		/* this works because the only way we get here is if
		   allowBufferRecycle is true and we went through the
		   above for loop, and exited via the break before
		   i==nRodNodes was true.  That means tmpNode will
		   still point to a valid node -- the one we want for
		   here! */
		xorNode->results[0] = tmpNode->params[1].p;
	}

	/* initialize the Wnp node */
	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
		    rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
		    dag_h, "Wnp", allocList);
	wnpNode->params[0].p = asmap->parityInfo;
	wnpNode->params[1].p = xorNode->results[0];
	wnpNode->params[2].v = parityStripeID;
	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
	/* parityInfo must describe entire parity unit */
	RF_ASSERT(asmap->parityInfo->next == NULL);

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		/*
	         * We never try to recycle a buffer for the Q calculation
	         * in addition to the parity. This would cause two buffers
	         * to get smashed during the P and Q calculation, guaranteeing
	         * one would be wrong.
	         */
		RF_MallocAndAdd(xorNode->results[1],
				rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
				(void *), allocList);
		rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
			    rf_DiskWriteUndoFunc, rf_GenericWakeupFunc,
			    1, 1, 4, 0, dag_h, "Wnq", allocList);
		wnqNode->params[0].p = asmap->qInfo;
		wnqNode->params[1].p = xorNode->results[1];
		wnqNode->params[2].v = parityStripeID;
		wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		/* parityInfo must describe entire parity unit */
		RF_ASSERT(asmap->parityInfo->next == NULL);
	}