void RBBISetBuilder::printSets() {
    int                   i;

    RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
    for (i=0; ; i++) {
        RBBINode        *usetNode;
        RBBINode        *setRef;
        RBBINode        *varRef;
        UnicodeString    setName;

        usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
        if (usetNode == NULL) {
            break;
        }

        RBBIDebugPrintf("%3d    ", i);
        setName = UNICODE_STRING("anonymous", 9);
        setRef = usetNode->fParent;
        if (setRef != NULL) {
            varRef = setRef->fParent;
            if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
                setName = varRef->fText;
            }
        }
        RBBI_DEBUG_printUnicodeString(setName);
        RBBIDebugPrintf("   ");
        RBBI_DEBUG_printUnicodeString(usetNode->fText);
        RBBIDebugPrintf("\n");
        if (usetNode->fLeftChild != NULL) {
            usetNode->fLeftChild->printTree(TRUE);
        }
    }
    RBBIDebugPrintf("\n");
}
Esempio n. 2
0
void RBBINode::printNode() {
    static const char * const nodeTypeNames[] = {
                "setRef",
                "uset",
                "varRef",
                "leafChar",
                "lookAhead",
                "tag",
                "endMark",
                "opStart",
                "opCat",
                "opOr",
                "opStar",
                "opPlus",
                "opQuestion",
                "opBreak",
                "opReverse",
                "opLParen"
    };

    if (this==NULL) {
        RBBIDebugPrintf("%10p", (void *)this);
    } else {
        RBBIDebugPrintf("%10p  %12s  %10p  %10p  %10p      %4d     %6d   %d ",
            (void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild,
            fSerialNum, fFirstPos, fVal);
        if (fType == varRef) {
            RBBI_DEBUG_printUnicodeString(fText);
        }
    }
    RBBIDebugPrintf("\n");
}
Esempio n. 3
0
void RBBISetBuilder::printRanges()
{
	RangeDescriptor    *   rlRange;
	int                    i;

	RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
	for (rlRange = fRangeList; rlRange != 0; rlRange = rlRange->fNext)
	{
		RBBIDebugPrintf("%2i  %4x-%4x  ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);

		for (i = 0; i < rlRange->fIncludesSets->size(); i++)
		{
			RBBINode    *   usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
			UnicodeString   setName = UNICODE_STRING("anon", 4);
			RBBINode    *   setRef = usetNode->fParent;
			if (setRef != NULL)
			{
				RBBINode * varRef = setRef->fParent;
				if (varRef != NULL  &&  varRef->fType == RBBINode::varRef)
				{
					setName = varRef->fText;
				}
			}
			RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf("  ");
		}
		RBBIDebugPrintf("\n");
	}
}
Esempio n. 4
0
void RBBISymbolTable::rbbiSymtablePrint() const {
    RBBIDebugPrintf("Variable Definitions\n"
           "Name               Node Val     String Val\n"
           "----------------------------------------------------------------------\n");

    int32_t pos = -1;
    const UHashElement  *e   = NULL;
    for (;;) {
        e = uhash_nextElement(fHashTable,  &pos);
        if (e == NULL ) {
            break;
        }
        RBBISymbolTableEntry  *s   = (RBBISymbolTableEntry *)e->value.pointer;

        RBBI_DEBUG_printUnicodeString(s->key, 15);
        RBBIDebugPrintf("   %8p   ", (void *)s->val);
        RBBI_DEBUG_printUnicodeString(s->val->fLeftChild->fText);
        RBBIDebugPrintf("\n");
    }

    RBBIDebugPrintf("\nParsed Variable Definitions\n");
    pos = -1;
    for (;;) {
        e = uhash_nextElement(fHashTable,  &pos);
        if (e == NULL ) {
            break;
        }
        RBBISymbolTableEntry  *s   = (RBBISymbolTableEntry *)e->value.pointer;
        RBBI_DEBUG_printUnicodeString(s->key);
        s->val->fLeftChild->printTree(TRUE);
        RBBIDebugPrintf("\n");
    }
}
Esempio n. 5
0
void RBBITableBuilder::printSet(UVector *s) {
    int32_t  i;
    for (i=0; i<s->size(); i++) {
        const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
        RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
    }
    RBBIDebugPrintf("\n");
}
Esempio n. 6
0
void RBBITableBuilder::printSet(UVector *s) {
    int32_t  i;
    for (i=0; i<s->size(); i++) {
        void *v = s->elementAt(i);
        RBBIDebugPrintf("%10p", v);
    }
    RBBIDebugPrintf("\n");
}
Esempio n. 7
0
U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth)
{
    int i;
    for (i=0; i<s.length(); i++) {
        RBBIDebugPrintf("%c", s.charAt(i));
        // putc(s.charAt(i), stdout);
    }
    for (i=s.length(); i<minWidth; i++) {
        RBBIDebugPrintf(" ");
    }
}
Esempio n. 8
0
void RBBIRuleScanner::printNodeStack(const char *title) {
    int i;
    RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
    for (i=fNodeStackPtr; i>0; i--) {
        fNodeStack[i]->printTree(TRUE);
    }
}
Esempio n. 9
0
void RBBITableBuilder::printPosSets(RBBINode *n) {
    if (n==NULL) {
        return;
    }
    n->printNode();
    RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");

    RBBIDebugPrintf("         firstpos:  ");
    printSet(n->fFirstPosSet);

    RBBIDebugPrintf("         lastpos:   ");
    printSet(n->fLastPosSet);

    RBBIDebugPrintf("         followpos: ");
    printSet(n->fFollowPos);

    printPosSets(n->fLeftChild);
    printPosSets(n->fRightChild);
}
Esempio n. 10
0
void RBBISetBuilder::printRangeGroups()
{
	RangeDescriptor    *   rlRange;
	RangeDescriptor    *   tRange;
	int                    i;
	int                    lastPrintedGroupNum = 0;

	RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
	for (rlRange = fRangeList; rlRange != 0; rlRange = rlRange->fNext)
	{
		int groupNum = rlRange->fNum & 0xbfff;
		if (groupNum > lastPrintedGroupNum)
		{
			lastPrintedGroupNum = groupNum;
			RBBIDebugPrintf("%2i  ", groupNum);

			if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}

			for (i = 0; i < rlRange->fIncludesSets->size(); i++)
			{
				RBBINode    *   usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
				UnicodeString   setName = UNICODE_STRING("anon", 4);
				RBBINode    *   setRef = usetNode->fParent;
				if (setRef != NULL)
				{
					RBBINode * varRef = setRef->fParent;
					if (varRef != NULL  &&  varRef->fType == RBBINode::varRef)
					{
						setName = varRef->fText;
					}
				}
				RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
			}

			i = 0;
			for (tRange = rlRange; tRange != 0; tRange = tRange->fNext)
			{
				if (tRange->fNum == rlRange->fNum)
				{
					if (i++ % 5 == 0)
					{
						RBBIDebugPrintf("\n    ");
					}
					RBBIDebugPrintf("  %05x-%05x", tRange->fStartChar, tRange->fEndChar);
				}
			}
			RBBIDebugPrintf("\n");
		}
	}
	RBBIDebugPrintf("\n");
}
Esempio n. 11
0
void RBBISymbolTable::rbbiSymtablePrint() const {
    RBBIDebugPrintf("Variable Definitions Symbol Table\n"
           "Name                  Node         serial  String Val\n"
           "-------------------------------------------------------------------\n");

    int32_t pos = UHASH_FIRST;
    const UHashElement  *e   = NULL;
    for (;;) {
        e = uhash_nextElement(fHashTable,  &pos);
        if (e == NULL ) {
            break;
        }
        RBBISymbolTableEntry  *s   = (RBBISymbolTableEntry *)e->value.pointer;

        RBBIDebugPrintf("%-19s   %8p %7d ", CStr(s->key)(), (void *)s->val, s->val->fSerialNum);
        RBBIDebugPrintf(" %s\n", CStr(s->val->fLeftChild->fText)());
    }

    RBBIDebugPrintf("\nParsed Variable Definitions\n");
    pos = -1;
    for (;;) {
        e = uhash_nextElement(fHashTable,  &pos);
        if (e == NULL ) {
            break;
        }
        RBBISymbolTableEntry  *s   = (RBBISymbolTableEntry *)e->value.pointer;
        RBBIDebugPrintf("%s\n", CStr(s->key)());
        RBBINode::printTree(s->val, TRUE);
        RBBINode::printTree(s->val->fLeftChild, FALSE);
        RBBIDebugPrintf("\n");
    }
}
Esempio n. 12
0
//---------------------------------------------------------------------------------
//
//  pushNewNode   create a new RBBINode of the specified type and push it
//                onto the stack of nodes.
//
//---------------------------------------------------------------------------------
RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {
    fNodeStackPtr++;
    if (fNodeStackPtr >= kStackSize) {
        error(U_BRK_INTERNAL_ERROR);
        RBBIDebugPrintf("RBBIRuleScanner::pushNewNode - stack overflow.\n");
        *fRB->fStatus = U_BRK_INTERNAL_ERROR;
        return NULL;
    }
    fNodeStack[fNodeStackPtr] = new RBBINode(t);
    if (fNodeStack[fNodeStackPtr] == NULL) {
        *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
    }
    return fNodeStack[fNodeStackPtr];
}
Esempio n. 13
0
void  RBBIDataWrapper::printData() {
    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    RBBIDebugPrintf("   Version = %d\n", fHeader->fVersion);
    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);

    printTable("Forward State Transition Table", fForwardTable);
    printTable("Reverse State Transition Table", fReverseTable);
    printTable("Safe Forward State Transition Table", fSafeFwdTable);
    printTable("Safe Reverse State Transition Table", fSafeRevTable);

    RBBIDebugPrintf("\nOrignal Rules source:\n");
    for (int32_t c=0; fRuleSource[c] != 0; c++) {
        RBBIDebugPrintf("%c", fRuleSource[c]);
    }
    RBBIDebugPrintf("\n\n");
}
Esempio n. 14
0
//----------------------------------------------------------------------------------------
//
//  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
//               An entry on the stack may be as small as a single setRef node,
//               or as large as the parse tree
//               for an entire expression (this will be the one item left on the stack
//               when the parsing of an RBBI rule completes.
//
//               This function is called when a binary operator is encountered.
//               It looks back up the stack for operators that are not yet associated
//               with a right operand, and if the precedence of the stacked operator >=
//               the precedence of the current operator, binds the operand left,
//               to the previously encountered operator.
//
//----------------------------------------------------------------------------------------
void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
    RBBINode *n;
    // printNodeStack("entering fixOpStack()");
    for (;;) {
        n = fNodeStack[fNodeStackPtr-1];   // an operator node
        if (n->fPrecedence == 0) {
            RBBIDebugPrintf("RBBIRuleScanner::fixOpStack, bad operator node\n");
            error(U_BRK_INTERNAL_ERROR);
            return;
        }

        if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
            // The most recent operand goes with the current operator,
            //   not with the previously stacked one.
            break;
        }
            // Stack operator is a binary op  ( '|' or concatenation)
            //   TOS operand becomes right child of this operator.
            //   Resulting subexpression becomes the TOS operand.
            n->fRightChild = fNodeStack[fNodeStackPtr];
            fNodeStack[fNodeStackPtr]->fParent = n;
            fNodeStackPtr--;
        // printNodeStack("looping in fixOpStack()   ");
    }

    if (p <= RBBINode::precLParen) {
        // Scan is at a right paren or end of expression.
        //  The scanned item must match the stack, or else there was an error.
        //  Discard the left paren (or start expr) node from the stack,
            //  leaving the completed (sub)expression as TOS.
            if (n->fPrecedence != p) {
                // Right paren encountered matched start of expression node, or
                // end of expression matched with a left paren node.
                error(U_BRK_MISMATCHED_PAREN);
            }
            fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
            fNodeStackPtr--;
            // Delete the now-discarded LParen or Start node.
            delete n;
    }
    // printNodeStack("leaving fixOpStack()");
}
Esempio n. 15
0
void  RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
                                                    fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);

    printTable("Forward State Transition Table", fForwardTable);
    printTable("Reverse State Transition Table", fReverseTable);

    RBBIDebugPrintf("\nOrignal Rules source:\n");
    for (int32_t c=0; fRuleSource[c] != 0; c++) {
        RBBIDebugPrintf("%c", fRuleSource[c]);
    }
    RBBIDebugPrintf("\n\n");
#endif
}
Esempio n. 16
0
void RBBINode::printTree(UBool printHeading) {
    if (printHeading) {
        RBBIDebugPrintf( "-------------------------------------------------------------------\n"
                         "    Address       type         Parent   LeftChild  RightChild    serial  position value\n"
              );
    }
    this->printNode();
    if (this != NULL) {
        // Only dump the definition under a variable reference if asked to.
        // Unconditinally dump children of all other node types.
        if (fType != varRef) {
            if (fLeftChild != NULL) {
                fLeftChild->printTree(FALSE);
            }
            
            if (fRightChild != NULL) {
                fRightChild->printTree(FALSE);
            }
        }
    }
}
Esempio n. 17
0
void RBBITableBuilder::printRuleStatusTable() {
    int32_t  thisRecord = 0;
    int32_t  nextRecord = 0;
    int      i;
    UVector  *tbl = fRB->fRuleStatusVals;

    RBBIDebugPrintf("index |  tags \n");
    RBBIDebugPrintf("-------------------\n");

    while (nextRecord < tbl->size()) {
        thisRecord = nextRecord;
        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
        RBBIDebugPrintf("%4d   ", thisRecord);
        for (i=thisRecord+1; i<nextRecord; i++) {
            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n\n");
}
Esempio n. 18
0
//-----------------------------------------------------------------------------------
//
//  handlePrevious()
//
//      Iterate backwards, according to the logic of the reverse rules.
//      This version handles the exact style backwards rules.
//
//      The logic of this function is very similar to handleNext(), above.
//
//-----------------------------------------------------------------------------------
int32_t BreakIterator::handlePrevious(const RBBIStateTable *statetable) {
    int32_t             state;
    int16_t             category        = 0;
    RBBIRunMode         mode;
    RBBIStateTableRow  *row;
    UChar32             c;
    int32_t             lookaheadStatus = 0;
    int32_t             result          = 0;
    int32_t             initialPosition = 0;
    int32_t             lookaheadResult = 0;
    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

    #ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPuts("Handle Previous   pos   char  state category");
        }
    #endif

    // handlePrevious() never gets the rule status.
    // Flag the status as invalid; if the user ever asks for status, we will need
    // to back up, then re-find the break position using handleNext(), which does
    // get the status value.
    fLastStatusIndexValid = FALSE;
    fLastRuleStatusIndex = 0;

    // if we're already at the start of the text, return DONE.
    if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
        return BreakIterator::DONE;
    }

    //  Set up the starting char.
    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    result          = initialPosition;
    c               = UTEXT_PREVIOUS32(fText);

    //  Set the initial state for the state machine
    state = START_STATE;
    row = (RBBIStateTableRow *)
            (statetable->fTableData + (statetable->fRowLen * state));
    category = 3;
    mode     = RBBI_RUN;
    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
        category = 2;
        mode     = RBBI_START;
    }


    // loop until we reach the start of the text or transition to state 0
    //
    for (;;) {
        if (c == U_SENTINEL) {
            // Reached end of input string.
            if (mode == RBBI_END || 
                *(int32_t *)fHeader->fFormatVersion == 1 ) {
                // We have already run the loop one last time with the 
                //   character set to the psueudo {eof} value.  Now it is time
                //   to unconditionally bail out.
                //  (Or we have an old format binary rule file that does not support {eof}.)
                if (lookaheadResult < result) {
                    // We ran off the end of the string with a pending look-ahead match.
                    // Treat this as if the look-ahead condition had been met, and return
                    //  the match at the / position from the look-ahead rule.
                    result               = lookaheadResult;
                    lookaheadStatus = 0;
                } else if (result == initialPosition) {
                    // Ran off start, no match found.
                    // move one index one (towards the start, since we are doing a previous())
                    UTEXT_SETNATIVEINDEX(fText, initialPosition);
                    UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
                }
                break;
            }
            // Run the loop one last time with the fake end-of-input character category.
            mode = RBBI_END;
            category = 1;
        }

        //
        // Get the char category.  An incoming category of 1 or 2 means that
        //      we are preset for doing the beginning or end of input, and
        //      that we shouldn't get a category from an actual text input character.
        //
        if (mode == RBBI_RUN) {
            // look up the current character's character category, which tells us
            // which column in the state table to look at.
            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
            //        not the size of the character going in, which is a UChar32.
            //
            UTRIE_GET16(&fTrie, c, category);

            // Check the dictionary bit in the character's category.
            //    Counter is only used by dictionary based iterators (subclasses).
            //    Chars that need to be handled by a dictionary have a flag bit set
            //    in their category values.
            //
            if ((category & 0x4000) != 0)  {
                fDictionaryCharCount++;
                //  And off the dictionary flag bit.
                category &= ~0x4000;
            }
        }

        #ifdef RBBI_DEBUG
            if (fTrace) {
                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
                if (0x20<=c && c<0x7f) {
                    RBBIDebugPrintf("\"%c\"  ", c);
                } else {
                    RBBIDebugPrintf("%5x  ", c);
                }
                RBBIDebugPrintf("%3d  %3d\n", state, category);
            }
        #endif

        // State Transition - move machine to its next state
        //
        state = row->fNextState[category];
        row = (RBBIStateTableRow *)
            (statetable->fTableData + (statetable->fRowLen * state));

        if (row->fAccepting == -1) {
            // Match found, common case.
            result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        }

        if (row->fLookAhead != 0) {
            if (lookaheadStatus != 0
                && row->fAccepting == lookaheadStatus) {
                // Lookahead match is completed.  
                result               = lookaheadResult;
                lookaheadStatus      = 0;
                // TODO:  make a standalone hard break in a rule work.
                if (lookAheadHardBreak) {
                    UTEXT_SETNATIVEINDEX(fText, result);
                    return result;
                }
                // Look-ahead completed, but other rules may match further.  Continue on
                //  TODO:  junk this feature?  I don't think it's used anywhwere.
                goto continueOn;
            }

            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            lookaheadResult = r;
            lookaheadStatus = row->fLookAhead;
            goto continueOn;
        }


        if (row->fAccepting != 0) {
            // Because this is an accepting state, any in-progress look-ahead match
            //   is no longer relavant.  Clear out the pending lookahead status.
            lookaheadStatus = 0;    
        }

continueOn:
        if (state == STOP_STATE) {
            // This is the normal exit from the lookup state machine.
            // We have advanced through the string until it is certain that no
            //   longer match is possible, no matter what characters follow.
            break;
        }

        // Move (backwards) to the next character to process.  
        // If this is a beginning-of-input loop iteration, don't advance
        //    the input position.  The next iteration will be processing the
        //    first real input character.
        if (mode == RBBI_RUN) {
            c = UTEXT_PREVIOUS32(fText);
        } else {            
            if (mode == RBBI_START) {
                mode = RBBI_RUN;
            }
        }
    }

    // The state machine is done.  Check whether it found a match...

    // If the iterator failed to advance in the match engine, force it ahead by one.
    //   (This really indicates a defect in the break rules.  They should always match
    //    at least one character.)
    if (result == initialPosition) {
        UTEXT_SETNATIVEINDEX(fText, initialPosition);
        UTEXT_PREVIOUS32(fText);
        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    }

    // Leave the iterator at our result position.
    UTEXT_SETNATIVEINDEX(fText, result);
    #ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPrintf("result = %d\n\n", result);
        }
    #endif
    return result;
}
Esempio n. 19
0
//---------------------------------------------------------------------------------
//
//  Parse RBBI rules.   The state machine for rules parsing is here.
//                      The state tables are hand-written in the file rbbirpt.txt,
//                      and converted to the form used here by a perl
//                      script rbbicst.pl
//
//---------------------------------------------------------------------------------
void RBBIRuleScanner::parse() {
    uint16_t                state;
    const RBBIRuleTableEl  *tableEl;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    state = 1;
    nextChar(fC);
    //
    // Main loop for the rule parsing state machine.
    //   Runs once per state transition.
    //   Each time through optionally performs, depending on the state table,
    //      - an advance to the the next input char
    //      - an action to be performed.
    //      - pushing or popping a state to/from the local state return stack.
    //
    for (;;) {
        //  Bail out if anything has gone wrong.
        //  RBBI rule file parsing stops on the first error encountered.
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }

        // Quit if state == 0.  This is the normal way to exit the state machine.
        //
        if (state == 0) {
            break;
        }

        // Find the state table element that matches the input char from the rule, or the
        //    class of the input character.  Start with the first table row for this
        //    state, then linearly scan forward until we find a row that matches the
        //    character.  The last row for each state always matches all characters, so
        //    the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
#ifdef RBBI_DEBUG
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
            RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
                            fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
        }
#endif

        for (;;) {
#ifdef RBBI_DEBUG
            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
                RBBIDebugPrintf(".");
            }
#endif
            if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not escaped, and
                //   the input character matched it.
                break;
            }
            if (tableEl->fCharClass == 255) {
                // Table row specified default, match anything character class.
                break;
            }
            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
                // Table row specified "escaped" and the char was escaped.
                break;
            }
            if (tableEl->fCharClass == 253 && fC.fEscaped &&
                    (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
                // Table row specified "escaped P" and the char is either 'p' or 'P'.
                break;
            }
            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
                // Table row specified eof and we hit eof on the input.
                break;
            }

            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
                    fC.fEscaped == FALSE &&                                      //   char is not escaped &&
                    fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                UnicodeSet *uniset = fRuleSets[tableEl->fCharClass-128];
                if (uniset->contains(fC.fChar)) {
                    // Table row specified a character class, or set of characters,
                    //   and the current char matches it.
                    break;
                }
            }

            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
            RBBIDebugPuts("");
        }

        //
        // We've found the row of the state table that matches the current input
        //   character from the rules string.
        // Perform any action specified  by this row in the state table.
        if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
            // Break out of the state machine loop if the
            //   the action signalled some kind of error, or
            //   the action was to exit, occurs on normal end-of-rules-input.
            break;
        }

        if (tableEl->fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
        }

        if (tableEl->fNextChar) {
            nextChar(fC);
        }

        // Get the next state from the table entry, or from the
        //   state stack if the next state was specified as "pop".
        if (tableEl->fNextState != 255) {
            state = tableEl->fNextState;
        } else {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
                fStackPtr++;
            }
        }

    }

    //
    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
    //
    if (fRB->fReverseTree == NULL) {
        fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
        RBBINode  *operand = pushNewNode(RBBINode::setRef);
        findSetFor(kAny, operand);
        fRB->fReverseTree->fLeftChild = operand;
        operand->fParent              = fRB->fReverseTree;
        fNodeStackPtr -= 2;
    }


    //
    // Parsing of the input RBBI rules is complete.
    // We now have a parse tree for the rule expressions
    // and a list of all UnicodeSets that are referenced.
    //
#ifdef RBBI_DEBUG
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {
        fSymbolTable->rbbiSymtablePrint();
    }
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
    {
        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
        fRB->fForwardTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
        fRB->fReverseTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
        fRB->fSafeFwdTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
        fRB->fSafeRevTree->printTree(TRUE);
    }
#endif
}
Esempio n. 20
0
//---------------------------------------------------------------------------------
//
//  scanSet    Construct a UnicodeSet from the text at the current scan
//             position.  Advance the scan position to the first character
//             after the set.
//
//             A new RBBI setref node referring to the set is pushed onto the node
//             stack.
//
//             The scan position is normally under the control of the state machine
//             that controls rule parsing.  UnicodeSets, however, are parsed by
//             the UnicodeSet constructor, not by the RBBI rule parser.
//
//---------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() {
    UnicodeSet    *uset;
    ParsePosition  pos;
    int            startPos;
    int            i;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    pos.setIndex(fScanIndex);
    startPos = fScanIndex;
    UErrorCode localStatus = U_ZERO_ERROR;
    uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE,
                          fSymbolTable,
                          localStatus);
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
#ifdef RBBI_DEBUG
        RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
#endif
        error(localStatus);
        delete uset;
        return;
    }

    // Verify that the set contains at least one code point.
    //
    if (uset->isEmpty()) {
        // This set is empty.
        //  Make it an error, because it almost certainly is not what the user wanted.
        //  Also, avoids having to think about corner cases in the tree manipulation code
        //   that occurs later on.
        error(U_BRK_RULE_EMPTY_SET);
        delete uset;
        return;
    }


    // Advance the RBBI parse postion over the UnicodeSet pattern.
    //   Don't just set fScanIndex because the line/char positions maintained
    //   for error reporting would be thrown off.
    i = pos.getIndex();
    for (;;) {
        if (fNextIndex >= i) {
            break;
        }
        nextCharLL();
    }

    if (U_SUCCESS(*fRB->fStatus)) {
        RBBINode         *n;

        n = pushNewNode(RBBINode::setRef);
        n->fFirstPos = startPos;
        n->fLastPos  = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        //  findSetFor() serves several purposes here:
        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
        //     - Mantains collection of all sets in use, needed later for establishing
        //          character categories for run time engine.
        //     - Eliminates mulitiple instances of the same set.
        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
        findSetFor(n->fText, n, uset);
    }

}
Esempio n. 21
0
//-----------------------------------------------------------------------------
//
//  print   -  debugging function to dump the runtime data tables.
//
//-----------------------------------------------------------------------------
void  RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
    uint32_t c, s;

    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    RBBIDebugPrintf("   Version = %d\n", fHeader->fVersion);
    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);

    RBBIDebugPrintf("   Forward State Transition Table\n");
    RBBIDebugPrintf("State |  Acc  LA   Tag");
    for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
    RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {RBBIDebugPrintf("----");}
    RBBIDebugPrintf("\n");

    for (s=0; s<fForwardTable->fNumStates; s++) {
        RBBIStateTableRow *row = (RBBIStateTableRow *)
                                  (fForwardTable->fTableData + (fForwardTable->fRowLen * s));
        RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTag);
        for (c=0; c<fHeader->fCatCount; c++)  {
            RBBIDebugPrintf("%3d ", row->fNextState[c]);
        }
        RBBIDebugPrintf("\n");
    }

    RBBIDebugPrintf("\nOrignal Rules source:\n");
    c = 0;
    for (;;) {
        if (fRuleSource[c] == 0)
            break;
        RBBIDebugPrintf("%c", fRuleSource[c]);
        c++;
    }
    RBBIDebugPrintf("\n\n");
#endif
}
Esempio n. 22
0
//-----------------------------------------------------------------------------
//
//   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
//                               table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
void  RBBITableBuilder::build() {

    if (U_FAILURE(*fStatus)) {
        return;
    }

    // If there were no rules, just return.  This situation can easily arise
    //   for the reverse rules.
    if (fTree==NULL) {
        return;
    }

    //
    // Walk through the tree, replacing any references to $variables with a copy of the
    //   parse tree for the substition expression.
    //
    fTree = fTree->flattenVariables();
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
        RBBIDebugPrintf("Parse tree after flattening variable references.\n");
        fTree->printTree(TRUE);
    }

    //
    // Add a unique right-end marker to the expression.
    //   Appears as a cat-node, left child being the original tree,
    //   right child being the end marker.
    //
    RBBINode *cn = new RBBINode(RBBINode::opCat);
    cn->fLeftChild = fTree;
    fTree->fParent = cn;
    cn->fRightChild = new RBBINode(RBBINode::endMark);
    cn->fRightChild->fParent = cn;
    fTree = cn;

    //
    //  Replace all references to UnicodeSets with the tree for the equivalent
    //      expression.
    //
    fTree->flattenSets();
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
        RBBIDebugPrintf("Parse tree after flattening Unicode Set references.\n");
        fTree->printTree(TRUE);
    }


    //
    // calculate the functions nullable, firstpos, lastpos and followpos on
    // nodes in the parse tree.
    //    See the alogrithm description in Aho.
    //    Understanding how this works by looking at the code alone will be
    //       nearly impossible.
    //
    calcNullable(fTree);
    calcFirstPos(fTree);
    calcLastPos(fTree);
    calcFollowPos(fTree);
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
        RBBIDebugPrintf("\n\n");
        printPosSets(fTree);
    }

    //
    //  For "chained" rules, modify the followPos sets
    //
    if (fRB->fChainRules) {
        calcChainedFollowPos(fTree);
    }

    //
    // Build the DFA state transition tables.
    //
    buildStateTable();
    flagAcceptingStates();
    flagLookAheadStates();
    flagTaggedStates();

    //
    // Update the global table of rule status {tag} values
    // The rule builder has a global vector of status values that are common
    //    for all tables.  Merge the ones from this table into the global set.
    //
    mergeRuleStatusVals();

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
Esempio n. 23
0
void RBBITableBuilder::printStates() {
    int     c;    // input "character"
    int     n;    // state number

    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
    RBBIDebugPrintf("      | Acc  LA    Tag");
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
        RBBIDebugPrintf(" %2d", c);
    }
    RBBIDebugPrintf("\n");
    RBBIDebugPrintf("      |---------------");
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
        RBBIDebugPrintf("---");
    }
    RBBIDebugPrintf("\n");

    for (n=0; n<fDStates->size(); n++) {
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
        RBBIDebugPrintf("  %3d | " , n);
        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
            RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n\n");
}
Esempio n. 24
0
void  RBBIDataWrapper::printTable(const char * heading, const RBBIStateTable * table)
{
	uint32_t   c;
	uint32_t   s;

	RBBIDebugPrintf("   %s\n", heading);

	RBBIDebugPrintf("State |  Acc  LA TagIx");
	for (c = 0; c < fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
	RBBIDebugPrintf("\n------|---------------");
	for (c = 0; c < fHeader->fCatCount; c++)
	{
		RBBIDebugPrintf("----");
	}
	RBBIDebugPrintf("\n");

	if (table == NULL)
	{
		RBBIDebugPrintf("         N U L L   T A B L E\n\n");
		return;
	}
	for (s = 0; s < table->fNumStates; s++)
	{
		RBBIStateTableRow * row = (RBBIStateTableRow *)
		                          (table->fTableData + (table->fRowLen * s));
		RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
		for (c = 0; c < fHeader->fCatCount; c++)
		{
			RBBIDebugPrintf("%3d ", row->fNextState[c]);
		}
		RBBIDebugPrintf("\n");
	}
	RBBIDebugPrintf("\n");
}