//------------------------------------------------------------------------
//
//   build          Build the list of non-overlapping character ranges
//                  from the Unicode Sets.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
    RBBINode        *usetNode;
    RangeDescriptor *rlRange;

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}

    //
    //  Initialize the process by creating a single range encompassing all characters
    //  that is in no sets.
    //
    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
    fRangeList->fStartChar    = 0;
    fRangeList->fEndChar      = 0x10ffff;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    //
    //  Find the set of non-overlapping ranges of characters
    //
    int  ni;
    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }

        UnicodeSet      *inputSet             = usetNode->fInputSet;
        int32_t          inputSetRangeCount   = inputSet->getRangeCount();
        int              inputSetRangeIndex   = 0;
                         rlRange              = fRangeList;

        for (;;) {
            if (inputSetRangeIndex >= inputSetRangeCount) {
                break;
            }
            UChar32      inputSetRangeBegin  = inputSet->getRangeStart(inputSetRangeIndex);
            UChar32      inputSetRangeEnd    = inputSet->getRangeEnd(inputSetRangeIndex);

            // skip over ranges from the range list that are completely
            //   below the current range from the input unicode set.
            while (rlRange->fEndChar < inputSetRangeBegin) {
                rlRange = rlRange->fNext;
            }

            // If the start of the range from the range list is before with
            //   the start of the range from the unicode set, split the range list range
            //   in two, with one part being before (wholly outside of) the unicode set
            //   and the other containing the rest.
            //   Then continue the loop; the post-split current range will then be skipped
            //     over
            if (rlRange->fStartChar < inputSetRangeBegin) {
                rlRange->split(inputSetRangeBegin, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
                continue;
            }

            // Same thing at the end of the ranges...
            // If the end of the range from the range list doesn't coincide with
            //   the end of the range from the unicode set, split the range list
            //   range in two.  The first part of the split range will be
            //   wholly inside the Unicode set.
            if (rlRange->fEndChar > inputSetRangeEnd) {
                rlRange->split(inputSetRangeEnd+1, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // The current rlRange is now entirely within the UnicodeSet range.
            // Add this unicode set to the list of sets for this rlRange
            if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                rlRange->fIncludesSets->addElement(usetNode, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // Advance over ranges that we are finished with.
            if (inputSetRangeEnd == rlRange->fEndChar) {
                inputSetRangeIndex++;
            }
            rlRange = rlRange->fNext;
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}

    //
    //  Group the above ranges, with each group consisting of one or more
    //    ranges that are in exactly the same set of original UnicodeSets.
    //    The groups are numbered, and these group numbers are the set of
    //    input symbols recognized by the run-time state machine.
    //
    //    Numbering: # 0  (state table column 0) is unused.
    //               # 1  is reserved - table column 1 is for end-of-input
    //               # 2  is reserved - table column 2 is for beginning-in-input
    //               # 3  is the first range list.
    //
    RangeDescriptor *rlSearchRange;
    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
            if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
                rlRange->fNum = rlSearchRange->fNum;
                break;
            }
        }
        if (rlRange->fNum == 0) {
            fGroupCount ++;
            rlRange->fNum = fGroupCount+2; 
            rlRange->setDictionaryFlag();
            addValToSets(rlRange->fIncludesSets, fGroupCount+2);
        }
    }

    // Handle input sets that contain the special string {eof}.
    //   Column 1 of the state table is reserved for EOF on input.
    //   Column 2 is reserved for before-the-start-input.
    //            (This column can be optimized away later if there are no rule
    //             references to {bof}.)
    //   Add this column value (1 or 2) to the equivalent expression
    //     subtree for each UnicodeSet that contains the string {eof}
    //   Because {bof} and {eof} are not a characters in the normal sense,
    //   they doesn't affect the computation of ranges or TRIE.
    static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
    static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};

    UnicodeString eofString(eofUString);
    UnicodeString bofString(bofUString);
    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }
        UnicodeSet      *inputSet = usetNode->fInputSet;
        if (inputSet->contains(eofString)) {
            addValToSet(usetNode, 1);
        }
        if (inputSet->contains(bofString)) {
            addValToSet(usetNode, 2);
            fSawBOF = TRUE;
        }
    }


    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}

    //
    // Build the Trie table for mapping UChar32 values to the corresponding
    //   range group number
    //
    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
                      NULL,    //  Data array  (utrie will allocate one)
                      100000,  //  Max Data Length
                      0,       //  Initial value for all code points
                      0,       //  Lead surrogate unit value
                      TRUE);   //  Keep Latin 1 in separately


    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
    }
}
Example #2
0
//------------------------------------------------------------------------
//
//   build          Build the list of non-overlapping character ranges
//                  from the Unicode Sets.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
    RBBINode        *usetNode;
    RangeDescriptor *rlRange;

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {
        printSets();
    }

    //
    //  Initialize the process by creating a single range encompassing all characters
    //  that is in no sets.
    //
    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
    fRangeList->fStartChar    = 0;
    fRangeList->fEndChar      = 0x10ffff;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    //
    //  Find the set of non-overlapping ranges of characters
    //
    int  ni;
    for (ni=0; ; ni++) {
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }

        UnicodeSet      *inputSet             = usetNode->fInputSet;
        int32_t          inputSetRangeCount   = inputSet->getRangeCount();
        int              inputSetRangeIndex   = 0;
        rlRange              = fRangeList;

        for (;;) {
            if (inputSetRangeIndex >= inputSetRangeCount) {
                break;
            }
            UChar32      inputSetRangeBegin  = inputSet->getRangeStart(inputSetRangeIndex);
            UChar32      inputSetRangeEnd    = inputSet->getRangeEnd(inputSetRangeIndex);

            // skip over ranges from the range list that are completely
            //   below the current range from the input unicode set.
            while (rlRange->fEndChar < inputSetRangeBegin) {
                rlRange = rlRange->fNext;
            }

            // If the start of the range from the range list is before with
            //   the start of the range from the unicode set, split the range list range
            //   in two, with one part being before (wholly outside of) the unicode set
            //   and the other containing the rest.
            //   Then continue the loop; the post-split current range will then be skipped
            //     over
            if (rlRange->fStartChar < inputSetRangeBegin) {
                rlRange->split(inputSetRangeBegin, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
                continue;
            }

            // Same thing at the end of the ranges...
            // If the end of the range from the range list doesn't coincide with
            //   the end of the range from the unicode set, split the range list
            //   range in two.  The first part of the split range will be
            //   wholly inside the Unicode set.
            if (rlRange->fEndChar > inputSetRangeEnd) {
                rlRange->split(inputSetRangeEnd+1, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // The current rlRange is now entirely within the UnicodeSet range.
            // Add this unicode set to the list of sets for this rlRange
            if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                rlRange->fIncludesSets->addElement(usetNode, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // Advance over ranges that we are finished with.
            if (inputSetRangeEnd == rlRange->fEndChar) {
                inputSetRangeIndex++;
            }
            rlRange = rlRange->fNext;
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) {
        printRanges();
    }

    //
    //  Group the above ranges, with each group consisting of one or more
    //    ranges that are in exactly the same set of original UnicodeSets.
    //    The groups are numbered, and these group numbers are the set of
    //    input symbols recognized by the run-time state machine.
    //
    RangeDescriptor *rlSearchRange;
    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
            if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
                rlRange->fNum = rlSearchRange->fNum;
                break;
            }
        }
        if (rlRange->fNum == 0) {
            fGroupCount ++;
            rlRange->fNum = fGroupCount;
            rlRange->setDictionaryFlag();
            addValToSets(rlRange->fIncludesSets, fGroupCount);
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {
        printRangeGroups();
    }
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {
        printSets();
    }

    //
    // Build the Trie table for mapping UChar32 values to the corresponding
    //   range group number
    //
    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
                       NULL,    //  Data array  (utrie will allocate one)
                       100000,  //  Max Data Length
                       0,       //  Initial value for all code points
                       0,       //  Lead surrogate unit value
                       TRUE);   //  Keep Latin 1 in separately


    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
    }
}