//------------------------------------------------------------------------ // // build Build the list of non-overlapping character ranges // from the Unicode Sets. // //------------------------------------------------------------------------ void RBBISetBuilder::build() { RBBINode *usetNode; RangeDescriptor *rlRange; if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} // // Initialize the process by creating a single range encompassing all characters // that is in no sets. // fRangeList = new RangeDescriptor(*fStatus); // will check for status here fRangeList->fStartChar = 0; fRangeList->fEndChar = 0x10ffff; if (U_FAILURE(*fStatus)) { return; } // // Find the set of non-overlapping ranges of characters // int ni; for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; int32_t inputSetRangeCount = inputSet->getRangeCount(); int inputSetRangeIndex = 0; rlRange = fRangeList; for (;;) { if (inputSetRangeIndex >= inputSetRangeCount) { break; } UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); // skip over ranges from the range list that are completely // below the current range from the input unicode set. while (rlRange->fEndChar < inputSetRangeBegin) { rlRange = rlRange->fNext; } // If the start of the range from the range list is before with // the start of the range from the unicode set, split the range list range // in two, with one part being before (wholly outside of) the unicode set // and the other containing the rest. // Then continue the loop; the post-split current range will then be skipped // over if (rlRange->fStartChar < inputSetRangeBegin) { rlRange->split(inputSetRangeBegin, *fStatus); if (U_FAILURE(*fStatus)) { return; } continue; } // Same thing at the end of the ranges... // If the end of the range from the range list doesn't coincide with // the end of the range from the unicode set, split the range list // range in two. The first part of the split range will be // wholly inside the Unicode set. if (rlRange->fEndChar > inputSetRangeEnd) { rlRange->split(inputSetRangeEnd+1, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // The current rlRange is now entirely within the UnicodeSet range. // Add this unicode set to the list of sets for this rlRange if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { rlRange->fIncludesSets->addElement(usetNode, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // Advance over ranges that we are finished with. if (inputSetRangeEnd == rlRange->fEndChar) { inputSetRangeIndex++; } rlRange = rlRange->fNext; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} // // Group the above ranges, with each group consisting of one or more // ranges that are in exactly the same set of original UnicodeSets. // The groups are numbered, and these group numbers are the set of // input symbols recognized by the run-time state machine. // // Numbering: # 0 (state table column 0) is unused. // # 1 is reserved - table column 1 is for end-of-input // # 2 is reserved - table column 2 is for beginning-in-input // # 3 is the first range list. // RangeDescriptor *rlSearchRange; for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { rlRange->fNum = rlSearchRange->fNum; break; } } if (rlRange->fNum == 0) { fGroupCount ++; rlRange->fNum = fGroupCount+2; rlRange->setDictionaryFlag(); addValToSets(rlRange->fIncludesSets, fGroupCount+2); } } // Handle input sets that contain the special string {eof}. // Column 1 of the state table is reserved for EOF on input. // Column 2 is reserved for before-the-start-input. // (This column can be optimized away later if there are no rule // references to {bof}.) // Add this column value (1 or 2) to the equivalent expression // subtree for each UnicodeSet that contains the string {eof} // Because {bof} and {eof} are not a characters in the normal sense, // they doesn't affect the computation of ranges or TRIE. static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; UnicodeString eofString(eofUString); UnicodeString bofString(bofUString); for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; if (inputSet->contains(eofString)) { addValToSet(usetNode, 1); } if (inputSet->contains(bofString)) { addValToSet(usetNode, 2); fSawBOF = TRUE; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} // // Build the Trie table for mapping UChar32 values to the corresponding // range group number // fTrie = utrie_open(NULL, // Pre-existing trie to be filled in NULL, // Data array (utrie will allocate one) 100000, // Max Data Length 0, // Initial value for all code points 0, // Lead surrogate unit value TRUE); // Keep Latin 1 in separately for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); } }
//------------------------------------------------------------------------ // // build Build the list of non-overlapping character ranges // from the Unicode Sets. // //------------------------------------------------------------------------ void RBBISetBuilder::build() { RBBINode *usetNode; RangeDescriptor *rlRange; if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) { printSets(); } // // Initialize the process by creating a single range encompassing all characters // that is in no sets. // fRangeList = new RangeDescriptor(*fStatus); // will check for status here fRangeList->fStartChar = 0; fRangeList->fEndChar = 0x10ffff; if (U_FAILURE(*fStatus)) { return; } // // Find the set of non-overlapping ranges of characters // int ni; for (ni=0; ; ni++) { usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; int32_t inputSetRangeCount = inputSet->getRangeCount(); int inputSetRangeIndex = 0; rlRange = fRangeList; for (;;) { if (inputSetRangeIndex >= inputSetRangeCount) { break; } UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); // skip over ranges from the range list that are completely // below the current range from the input unicode set. while (rlRange->fEndChar < inputSetRangeBegin) { rlRange = rlRange->fNext; } // If the start of the range from the range list is before with // the start of the range from the unicode set, split the range list range // in two, with one part being before (wholly outside of) the unicode set // and the other containing the rest. // Then continue the loop; the post-split current range will then be skipped // over if (rlRange->fStartChar < inputSetRangeBegin) { rlRange->split(inputSetRangeBegin, *fStatus); if (U_FAILURE(*fStatus)) { return; } continue; } // Same thing at the end of the ranges... // If the end of the range from the range list doesn't coincide with // the end of the range from the unicode set, split the range list // range in two. The first part of the split range will be // wholly inside the Unicode set. if (rlRange->fEndChar > inputSetRangeEnd) { rlRange->split(inputSetRangeEnd+1, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // The current rlRange is now entirely within the UnicodeSet range. // Add this unicode set to the list of sets for this rlRange if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { rlRange->fIncludesSets->addElement(usetNode, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // Advance over ranges that we are finished with. if (inputSetRangeEnd == rlRange->fEndChar) { inputSetRangeIndex++; } rlRange = rlRange->fNext; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges(); } // // Group the above ranges, with each group consisting of one or more // ranges that are in exactly the same set of original UnicodeSets. // The groups are numbered, and these group numbers are the set of // input symbols recognized by the run-time state machine. // RangeDescriptor *rlSearchRange; for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { rlRange->fNum = rlSearchRange->fNum; break; } } if (rlRange->fNum == 0) { fGroupCount ++; rlRange->fNum = fGroupCount; rlRange->setDictionaryFlag(); addValToSets(rlRange->fIncludesSets, fGroupCount); } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) { printRangeGroups(); } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) { printSets(); } // // Build the Trie table for mapping UChar32 values to the corresponding // range group number // fTrie = utrie_open(NULL, // Pre-existing trie to be filled in NULL, // Data array (utrie will allocate one) 100000, // Max Data Length 0, // Initial value for all code points 0, // Lead surrogate unit value TRUE); // Keep Latin 1 in separately for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); } }