void CMAny::calcLastPos(CMStateSet& toSet) const { // If we are an epsilon node, then the last pos is an empty set if (isNullable()) toSet.zeroBits(); // Otherwise, its just the one bit of our position else toSet.setBit(fPosition); }
// --------------------------------------------------------------------------- // Implementation of protected CMNode virtual interface // --------------------------------------------------------------------------- void CMAny::calcFirstPos(CMStateSet& toSet) const { // If we are an epsilon node, then the first pos is an empty set if (fPosition == -1) toSet.zeroBits(); else // Otherwise, its just the one bit of our position toSet.setBit(fPosition); return; }
inline void CMLeaf::calcLastPos(CMStateSet& toSet) const { // If we are an epsilon node, then the last pos is an empty set if (fPosition == -1) { toSet.zeroBits(); return; } // Otherwise, its just the one bit of our position toSet.setBit(fPosition); }
// --------------------------------------------------------------------------- // DFAContentModel: Private helper methods // --------------------------------------------------------------------------- void DFAContentModel::buildDFA(ContentSpecNode* const curNode) { unsigned int index; // // The first step we need to take is to rewrite the content model using // our CMNode objects, and in the process get rid of any repetition short // cuts, converting them into '*' style repetitions or getting rid of // repetitions altogether. // // The conversions done are: // // x+ -> (x|x*) // x? -> (x|epsilon) // // This is a relatively complex scenario. What is happening is that we // create a top level binary node of which the special EOC value is set // as the right side node. The the left side is set to the rewritten // syntax tree. The source is the original content model info from the // decl pool. The rewrite is done by buildSyntaxTree() which recurses the // decl pool's content of the element and builds a new tree in the // process. // // Note that, during this operation, we set each non-epsilon leaf node's // DFA state position and count the number of such leafs, which is left // in the fLeafCount member. // CMLeaf* nodeEOC = new (fMemoryManager) CMLeaf ( new (fMemoryManager) QName ( XMLUni::fgZeroLenString , XMLUni::fgZeroLenString , XMLContentModel::gEOCFakeId , fMemoryManager ) , ~0 , true , fMemoryManager ); CMNode* nodeOrgContent = buildSyntaxTree(curNode); fHeadNode = new (fMemoryManager) CMBinaryOp ( ContentSpecNode::Sequence , nodeOrgContent , nodeEOC , fMemoryManager ); // // And handle specially the EOC node, which also must be numbered and // counted as a non-epsilon leaf node. It could not be handled in the // above tree build because it was created before all that started. We // save the EOC position since its used during the DFA building loop. // fEOCPos = fLeafCount; nodeEOC->setPosition(fLeafCount++); // // Ok, so now we have to iterate the new tree and do a little more work // now that we know the leaf count. One thing we need to do is to // calculate the first and last position sets of each node. This is // cached away in each of the nodes. // // Along the way we also set the leaf count in each node as the maximum // state count. They must know this in order to create their first/last // position sets. // // We also need to build an array of references to the non-epsilon // leaf nodes. Since we iterate here the same way as we did during the // initial tree build (which built their position numbers, we will put // them in the array according to their position values. // fLeafList = (CMLeaf**) fMemoryManager->allocate(fLeafCount*sizeof(CMLeaf*)); //new CMLeaf*[fLeafCount]; fLeafListType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate ( fLeafCount * sizeof(ContentSpecNode::NodeTypes) ); //new ContentSpecNode::NodeTypes[fLeafCount]; postTreeBuildInit(fHeadNode, 0); // // And, moving onward... We now need to build the follow position sets // for all the nodes. So we allocate an array of pointers to state sets, // one for each leaf node (i.e. each significant DFA position.) // fFollowList = (CMStateSet**) fMemoryManager->allocate ( fLeafCount * sizeof(CMStateSet*) ); //new CMStateSet*[fLeafCount]; for (index = 0; index < fLeafCount; index++) fFollowList[index] = new (fMemoryManager) CMStateSet(fLeafCount, fMemoryManager); calcFollowList(fHeadNode); // // Check to see whether this content model can handle an empty content, // which is something we need to optimize by looking now before we // throw away the info that would tell us that. // // If the left node of the head (the top level of the original content) // is nullable, then its true. // fEmptyOk = nodeOrgContent->isNullable(); // // And finally the big push... Now we build the DFA using all the states // and the tree we've built up. First we set up the various data // structures we are going to use while we do this. // // First of all we need an array of unique element ids in our content // model. For each transition table entry, we need a set of contiguous // indices to represent the transitions for a particular input element. // So we need to a zero based range of indexes that map to element types. // This element map provides that mapping. // fElemMap = (QName**) fMemoryManager->allocate ( fLeafCount * sizeof(QName*) ); //new QName*[fLeafCount]; fElemMapType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate ( fLeafCount * sizeof(ContentSpecNode::NodeTypes) ); //new ContentSpecNode::NodeTypes[fLeafCount]; fElemMapSize = 0; for (unsigned int outIndex = 0; outIndex < fLeafCount; outIndex++) { fElemMap[outIndex] = new (fMemoryManager) QName(fMemoryManager); if ( (fLeafListType[outIndex] & 0x0f) != ContentSpecNode::Leaf ) if (!fLeafNameTypeVector) fLeafNameTypeVector = new (fMemoryManager) ContentLeafNameTypeVector(fMemoryManager); // Get the current leaf's element index const QName* element = fLeafList[outIndex]->getElement(); const XMLCh* elementRawName = 0; if (fDTD && element) elementRawName = element->getRawName(); // See if the current leaf node's element index is in the list unsigned int inIndex = 0; for (; inIndex < fElemMapSize; inIndex++) { const QName* inElem = fElemMap[inIndex]; if (fDTD) { if (XMLString::equals(inElem->getRawName(), elementRawName)) { break; } } else { if ((fElemMapType[inIndex] == fLeafListType[outIndex]) && (inElem->getURI() == element->getURI()) && (XMLString::equals(inElem->getLocalPart(), element->getLocalPart()))) { break; } } } // If it was not in the list, then add it and bump the map size if (inIndex == fElemMapSize) { fElemMap[fElemMapSize]->setValues(*element); fElemMapType[fElemMapSize] = fLeafListType[outIndex]; ++fElemMapSize; } } // set up the fLeafNameTypeVector object if there is one. if (fLeafNameTypeVector) { fLeafNameTypeVector->setValues(fElemMap, fElemMapType, fElemMapSize); } /*** * Optimization(Jan, 2001); We sort fLeafList according to * elemIndex which is *uniquely* associated to each leaf. * We are *assuming* that each element appears in at least one leaf. **/ // don't forget to delete it int *fLeafSorter = (int*) fMemoryManager->allocate ( (fLeafCount + fElemMapSize) * sizeof(int) ); //new int[fLeafCount + fElemMapSize]; unsigned int fSortCount = 0; for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) { const QName* element = fElemMap[elemIndex]; const XMLCh* elementRawName = 0; if (fDTD && element) elementRawName = element->getRawName(); for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) { const QName* leaf = fLeafList[leafIndex]->getElement(); if (fDTD) { if (XMLString::equals(leaf->getRawName(), elementRawName)) { fLeafSorter[fSortCount++] = leafIndex; } } else { if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) && (leaf->getURI() == element->getURI()) && (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) { fLeafSorter[fSortCount++] = leafIndex; } } } fLeafSorter[fSortCount++] = -1; } // // Next lets create some arrays, some that that hold transient info // during the DFA build and some that are permament. These are kind of // sticky since we cannot know how big they will get, but we don't want // to use any collection type classes because of performance. // // Basically they will probably be about fLeafCount*2 on average, but can // be as large as 2^(fLeafCount*2), worst case. So we start with // fLeafCount*4 as a middle ground. This will be very unlikely to ever // have to expand though, it if does, the overhead will be somewhat ugly. // unsigned int curArraySize = fLeafCount * 4; const CMStateSet** statesToDo = (const CMStateSet**) fMemoryManager->allocate ( curArraySize * sizeof(const CMStateSet*) ); //new const CMStateSet*[curArraySize]; fFinalStateFlags = (bool*) fMemoryManager->allocate ( curArraySize * sizeof(bool) ); //new bool[curArraySize]; fTransTable = (unsigned int**) fMemoryManager->allocate ( curArraySize * sizeof(unsigned int*) ); //new unsigned int*[curArraySize]; // // Ok we start with the initial set as the first pos set of the head node // (which is the seq node that holds the content model and the EOC node.) // const CMStateSet* setT = new (fMemoryManager) CMStateSet(fHeadNode->getFirstPos()); // // Init our two state flags. Basically the unmarked state counter is // always chasing the current state counter. When it catches up, that // means we made a pass through that did not add any new states to the // lists, at which time we are done. We could have used a expanding array // of flags which we used to mark off states as we complete them, but // this is easier though less readable maybe. // unsigned int unmarkedState = 0; unsigned int curState = 0; // // Init the first transition table entry, and put the initial state // into the states to do list, then bump the current state. // fTransTable[curState] = makeDefStateList(); statesToDo[curState] = setT; curState++; // // the stateTable is an auxiliary means to fast // identification of new state created (instead // of squential loop statesToDo to find out), // while the role that statesToDo plays remain unchanged. // // TODO: in the future, we may change the 29 to something // derived from curArraySize. RefHashTableOf<XMLInteger> *stateTable = new (fMemoryManager) RefHashTableOf<XMLInteger> ( curArraySize , true , new (fMemoryManager) HashCMStateSet() , fMemoryManager ); //stateTable->put((CMStateSet*)setT, new (fMemoryManager) XMLInteger(0)); // // Ok, almost done with the algorithm from hell... We now enter the // loop where we go until the states done counter catches up with // the states to do counter. // CMStateSet* newSet = 0; while (unmarkedState < curState) { // // Get the next unmarked state out of the list of states to do. // And get the associated transition table entry. // setT = statesToDo[unmarkedState]; unsigned int* transEntry = fTransTable[unmarkedState]; // Mark this one final if it contains the EOC state fFinalStateFlags[unmarkedState] = setT->getBit(fEOCPos); // Bump up the unmarked state count, marking this state done unmarkedState++; // Optimization(Jan, 2001) unsigned int sorterIndex = 0; // Optimization(Jan, 2001) // Loop through each possible input symbol in the element map for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) { // // Build up a set of states which is the union of all of the // follow sets of DFA positions that are in the current state. If // we gave away the new set last time through then create a new // one. Otherwise, zero out the existing one. // if (!newSet) newSet = new (fMemoryManager) CMStateSet ( fLeafCount , fMemoryManager ); else newSet->zeroBits(); #ifdef OBSOLETED // unoptimized code for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) { // If this leaf index (DFA position) is in the current set... if (setT->getBit(leafIndex)) { // // If this leaf is the current input symbol, then we want // to add its follow list to the set of states to transition // to from the current state. // const QName* leaf = fLeafList[leafIndex]->getElement(); const QName* element = fElemMap[elemIndex]; if (fDTD) { if (XMLString::equals(leaf->getRawName(), element->getRawName())) { *newSet |= *fFollowList[leafIndex]; } } else { if ((leaf->getURI() == element->getURI()) && (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) { *newSet |= *fFollowList[leafIndex]; } } } } // for leafIndex #endif // Optimization(Jan, 2001) int leafIndex = fLeafSorter[sorterIndex++]; while (leafIndex != -1) { // If this leaf index (DFA position) is in the current set... if (setT->getBit(leafIndex)) { // // If this leaf is the current input symbol, then we // want to add its follow list to the set of states to // transition to from the current state. // *newSet |= *fFollowList[leafIndex]; } leafIndex = fLeafSorter[sorterIndex++]; } // while (leafIndex != -1) // // If this new set is not empty, then see if its in the list // of states to do. If not, then add it. // if (!newSet->isEmpty()) { // // Search the 'states to do' list to see if this new // state set is already in there. // /*** unsigned int stateIndex = 0; for (; stateIndex < curState; stateIndex++) { if (*statesToDo[stateIndex] == *newSet) break; } ***/ XMLInteger *stateObj = (XMLInteger*) (stateTable->get(newSet)); unsigned int stateIndex = (stateObj == 0 ? curState : stateObj->intValue()); // If we did not find it, then add it if (stateIndex == curState) { // // Put this new state into the states to do and init // a new entry at the same index in the transition // table. // statesToDo[curState] = newSet; fTransTable[curState] = makeDefStateList(); stateTable->put ( newSet , new (fMemoryManager) XMLInteger(curState) ); // We now have a new state to do so bump the count curState++; // // Null out the new set to indicate we adopted it. This // will cause the creation of a new set on the next time // around the loop. // newSet = 0; } // // Now set this state in the transition table's entry for this // element (using its index), with the DFA state we will move // to from the current state when we see this input element. // transEntry[elemIndex] = stateIndex; // Expand the arrays if we're full if (curState == curArraySize) { // // Yikes, we overflowed the initial array size, so we've // got to expand all of these arrays. So adjust up the // size by 50% and allocate new arrays. // const unsigned int newSize = (unsigned int)(curArraySize * 1.5); const CMStateSet** newToDo = (const CMStateSet**) fMemoryManager->allocate ( newSize * sizeof(const CMStateSet*) ); //new const CMStateSet*[newSize]; bool* newFinalFlags = (bool*) fMemoryManager->allocate ( newSize * sizeof(bool) ); //new bool[newSize]; unsigned int** newTransTable = (unsigned int**) fMemoryManager->allocate ( newSize * sizeof(unsigned int*) ); //new unsigned int*[newSize]; // Copy over all of the existing content for (unsigned int expIndex = 0; expIndex < curArraySize; expIndex++) { newToDo[expIndex] = statesToDo[expIndex]; newFinalFlags[expIndex] = fFinalStateFlags[expIndex]; newTransTable[expIndex] = fTransTable[expIndex]; } // Clean up the old stuff fMemoryManager->deallocate(statesToDo); //delete [] statesToDo; fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags; fMemoryManager->deallocate(fTransTable); //delete [] fTransTable; // Store the new array size and pointers curArraySize = newSize; statesToDo = newToDo; fFinalStateFlags = newFinalFlags; fTransTable = newTransTable; } //if (curState == curArraySize) } //if (!newSet->isEmpty()) } // for elemIndex } //while // Store the current state count in the trans table size fTransTableSize = curState; // If the last temp set was not stored, then clean it up if (newSet) delete newSet; // // Now we can clean up all of the temporary data that was needed during // DFA build. // // // Note on memory leak: Bugzilla#2707: // =================================== // The CMBinary, pointed to by fHeadNode, shall be released by // deleted by itself. // // Change has been made to postTreeBuildInit() such that fLeafList[] // would maintain its **OWN** copy of CMLeaf to avoid double deletion // of CMLeaf. // delete fHeadNode; for (index = 0; index < fLeafCount; index++) delete fFollowList[index]; fMemoryManager->deallocate(fFollowList); //delete [] fFollowList; // // removeAll() will delete all data, XMLInteger, // while the keys are to be deleted by the // deletion of statesToDo. // delete stateTable; for (index = 0; index < curState; index++) delete (CMStateSet*)statesToDo[index]; fMemoryManager->deallocate(statesToDo); //delete [] statesToDo; for (index = 0; index < fLeafCount; index++) delete fLeafList[index]; fMemoryManager->deallocate(fLeafList); //delete [] fLeafList; fMemoryManager->deallocate(fLeafSorter); //delete [] fLeafSorter; }