void RBBISetBuilder::printSets() { int i; RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); for (i=0; ; i++) { RBBINode *usetNode; RBBINode *setRef; RBBINode *varRef; UnicodeString setName; usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); if (usetNode == NULL) { break; } RBBIDebugPrintf("%3d ", i); setName = UNICODE_STRING("anonymous", 9); setRef = usetNode->fParent; if (setRef != NULL) { varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); RBBI_DEBUG_printUnicodeString(usetNode->fText); RBBIDebugPrintf("\n"); if (usetNode->fLeftChild != NULL) { usetNode->fLeftChild->printTree(TRUE); } } RBBIDebugPrintf("\n"); }
void RBBINode::printNode() { static const char * const nodeTypeNames[] = { "setRef", "uset", "varRef", "leafChar", "lookAhead", "tag", "endMark", "opStart", "opCat", "opOr", "opStar", "opPlus", "opQuestion", "opBreak", "opReverse", "opLParen" }; if (this==NULL) { RBBIDebugPrintf("%10p", (void *)this); } else { RBBIDebugPrintf("%10p %12s %10p %10p %10p %4d %6d %d ", (void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild, fSerialNum, fFirstPos, fVal); if (fType == varRef) { RBBI_DEBUG_printUnicodeString(fText); } } RBBIDebugPrintf("\n"); }
void RBBISetBuilder::printRanges() { RangeDescriptor * rlRange; int i; RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); for (rlRange = fRangeList; rlRange != 0; rlRange = rlRange->fNext) { RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); for (i = 0; i < rlRange->fIncludesSets->size(); i++) { RBBINode * usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName = UNICODE_STRING("anon", 4); RBBINode * setRef = usetNode->fParent; if (setRef != NULL) { RBBINode * varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } RBBIDebugPrintf("\n"); } }
void RBBISymbolTable::rbbiSymtablePrint() const { RBBIDebugPrintf("Variable Definitions\n" "Name Node Val String Val\n" "----------------------------------------------------------------------\n"); int32_t pos = -1; const UHashElement *e = NULL; for (;;) { e = uhash_nextElement(fHashTable, &pos); if (e == NULL ) { break; } RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; RBBI_DEBUG_printUnicodeString(s->key, 15); RBBIDebugPrintf(" %8p ", (void *)s->val); RBBI_DEBUG_printUnicodeString(s->val->fLeftChild->fText); RBBIDebugPrintf("\n"); } RBBIDebugPrintf("\nParsed Variable Definitions\n"); pos = -1; for (;;) { e = uhash_nextElement(fHashTable, &pos); if (e == NULL ) { break; } RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; RBBI_DEBUG_printUnicodeString(s->key); s->val->fLeftChild->printTree(TRUE); RBBIDebugPrintf("\n"); } }
// Debug helper: print the serial number of every node stored in `s` on one
// line, using -1 for NULL entries, then end the line.
void RBBITableBuilder::printSet(UVector *s) {
    for (int32_t idx = 0; idx < s->size(); ++idx) {
        const RBBINode *node = static_cast<const RBBINode *>(s->elementAt(idx));
        RBBIDebugPrintf("%5d", node != NULL ? node->fSerialNum : -1);
    }
    RBBIDebugPrintf("\n");
}
// Debug helper: print the raw pointer value of every element of `s` on one
// line, then end the line.
void RBBITableBuilder::printSet(UVector *s) {
    int32_t idx = 0;
    while (idx < s->size()) {
        void *element = s->elementAt(idx);
        RBBIDebugPrintf("%10p", element);
        ++idx;
    }
    RBBIDebugPrintf("\n");
}
// Debug helper: write the code units of `s` one at a time with "%c", then pad
// with spaces until at least `minWidth` columns have been written.
U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth) {
    int column = 0;
    while (column < s.length()) {
        RBBIDebugPrintf("%c", s.charAt(column));
        ++column;
    }

    // Padding starts where the string ended; nothing is added when the
    // string is already minWidth or longer.
    for (int pad = s.length(); pad < minWidth; ++pad) {
        RBBIDebugPrintf(" ");
    }
}
// Debug dump of the node stack, from the top entry down to index 1
// (index 0 is not printed). `title` is echoed in the heading line.
void RBBIRuleScanner::printNodeStack(const char *title) {
    RBBIDebugPrintf("%s. Dumping node stack...\n", title);

    int slot = fNodeStackPtr;
    while (slot > 0) {
        fNodeStack[slot]->printTree(TRUE);
        --slot;
    }
}
// Debug dump, in pre-order, of the nullable flag and the firstpos / lastpos /
// followpos sets of every node in the tree rooted at `n`.
// A NULL `n` prints nothing.
void RBBITableBuilder::printPosSets(RBBINode *n) {
    if (n != NULL) {
        n->printNode();

        const char *nullableText = n->fNullable ? "TRUE" : "FALSE";
        RBBIDebugPrintf(" Nullable: %s\n", nullableText);

        RBBIDebugPrintf(" firstpos: ");
        printSet(n->fFirstPosSet);

        RBBIDebugPrintf(" lastpos: ");
        printSet(n->fLastPosSet);

        RBBIDebugPrintf(" followpos: ");
        printSet(n->fFollowPos);

        // Left subtree first, then right.
        printPosSets(n->fLeftChild);
        printPosSets(n->fRightChild);
    }
}
void RBBISetBuilder::printRangeGroups() { RangeDescriptor * rlRange; RangeDescriptor * tRange; int i; int lastPrintedGroupNum = 0; RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); for (rlRange = fRangeList; rlRange != 0; rlRange = rlRange->fNext) { int groupNum = rlRange->fNum & 0xbfff; if (groupNum > lastPrintedGroupNum) { lastPrintedGroupNum = groupNum; RBBIDebugPrintf("%2i ", groupNum); if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");} for (i = 0; i < rlRange->fIncludesSets->size(); i++) { RBBINode * usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName = UNICODE_STRING("anon", 4); RBBINode * setRef = usetNode->fParent; if (setRef != NULL) { RBBINode * varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } i = 0; for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { if (tRange->fNum == rlRange->fNum) { if (i++ % 5 == 0) { RBBIDebugPrintf("\n "); } RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); } } RBBIDebugPrintf("\n"); } } RBBIDebugPrintf("\n"); }
void RBBISymbolTable::rbbiSymtablePrint() const { RBBIDebugPrintf("Variable Definitions Symbol Table\n" "Name Node serial String Val\n" "-------------------------------------------------------------------\n"); int32_t pos = UHASH_FIRST; const UHashElement *e = NULL; for (;;) { e = uhash_nextElement(fHashTable, &pos); if (e == NULL ) { break; } RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; RBBIDebugPrintf("%-19s %8p %7d ", CStr(s->key)(), (void *)s->val, s->val->fSerialNum); RBBIDebugPrintf(" %s\n", CStr(s->val->fLeftChild->fText)()); } RBBIDebugPrintf("\nParsed Variable Definitions\n"); pos = -1; for (;;) { e = uhash_nextElement(fHashTable, &pos); if (e == NULL ) { break; } RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; RBBIDebugPrintf("%s\n", CStr(s->key)()); RBBINode::printTree(s->val, TRUE); RBBINode::printTree(s->val->fLeftChild, FALSE); RBBIDebugPrintf("\n"); } }
//--------------------------------------------------------------------------------- // // pushNewNode create a new RBBINode of the specified type and push it // onto the stack of nodes. // //--------------------------------------------------------------------------------- RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { fNodeStackPtr++; if (fNodeStackPtr >= kStackSize) { error(U_BRK_INTERNAL_ERROR); RBBIDebugPrintf("RBBIRuleScanner::pushNewNode - stack overflow.\n"); *fRB->fStatus = U_BRK_INTERNAL_ERROR; return NULL; } fNodeStack[fNodeStackPtr] = new RBBINode(t); if (fNodeStack[fNodeStackPtr] == NULL) { *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; } return fNodeStack[fNodeStackPtr]; }
void RBBIDataWrapper::printData() { RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); RBBIDebugPrintf(" Version = %d\n", fHeader->fVersion); RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); printTable("Forward State Transition Table", fForwardTable); printTable("Reverse State Transition Table", fReverseTable); printTable("Safe Forward State Transition Table", fSafeFwdTable); printTable("Safe Reverse State Transition Table", fSafeRevTable); RBBIDebugPrintf("\nOrignal Rules source:\n"); for (int32_t c=0; fRuleSource[c] != 0; c++) { RBBIDebugPrintf("%c", fRuleSource[c]); } RBBIDebugPrintf("\n\n"); }
//---------------------------------------------------------------------------------------- // // fixOpStack The parse stack holds partially assembled chunks of the parse tree. // An entry on the stack may be as small as a single setRef node, // or as large as the parse tree // for an entire expression (this will be the one item left on the stack // when the parsing of an RBBI rule completes. // // This function is called when a binary operator is encountered. // It looks back up the stack for operators that are not yet associated // with a right operand, and if the precedence of the stacked operator >= // the precedence of the current operator, binds the operand left, // to the previously encountered operator. // //---------------------------------------------------------------------------------------- void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { RBBINode *n; // printNodeStack("entering fixOpStack()"); for (;;) { n = fNodeStack[fNodeStackPtr-1]; // an operator node if (n->fPrecedence == 0) { RBBIDebugPrintf("RBBIRuleScanner::fixOpStack, bad operator node\n"); error(U_BRK_INTERNAL_ERROR); return; } if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { // The most recent operand goes with the current operator, // not with the previously stacked one. break; } // Stack operator is a binary op ( '|' or concatenation) // TOS operand becomes right child of this operator. // Resulting subexpression becomes the TOS operand. n->fRightChild = fNodeStack[fNodeStackPtr]; fNodeStack[fNodeStackPtr]->fParent = n; fNodeStackPtr--; // printNodeStack("looping in fixOpStack() "); } if (p <= RBBINode::precLParen) { // Scan is at a right paren or end of expression. // The scanned item must match the stack, or else there was an error. // Discard the left paren (or start expr) node from the stack, // leaving the completed (sub)expression as TOS. 
if (n->fPrecedence != p) { // Right paren encountered matched start of expression node, or // end of expression matched with a left paren node. error(U_BRK_MISMATCHED_PAREN); } fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; fNodeStackPtr--; // Delete the now-discarded LParen or Start node. delete n; } // printNodeStack("leaving fixOpStack()"); }
// Debug dump of the runtime data: header fields (including the four format
// version bytes), the forward and reverse state tables, and the rule source
// text. The whole body is compiled only when RBBI_DEBUG is defined.
void RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    RBBIDebugPrintf(" Version = {%d %d %d %d}\n",
                    fHeader->fFormatVersion[0],
                    fHeader->fFormatVersion[1],
                    fHeader->fFormatVersion[2],
                    fHeader->fFormatVersion[3]);
    RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
    RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);

    printTable("Forward State Transition Table", fForwardTable);
    printTable("Reverse State Transition Table", fReverseTable);

    // Rule source is printed one unit at a time up to its zero terminator.
    RBBIDebugPrintf("\nOrignal Rules source:\n");
    int32_t offset = 0;
    while (fRuleSource[offset] != 0) {
        RBBIDebugPrintf("%c", fRuleSource[offset]);
        ++offset;
    }
    RBBIDebugPrintf("\n\n");
#endif
}
// Debug dump of the subtree rooted at this node, in pre-order.
// When printHeading is true a column heading is written first. Children of
// varRef nodes are not descended into; children of every other node type are.
//
// NOTE(review): like printNode(), this tolerates being invoked through a null
// pointer; that behavior is preserved here.
void RBBINode::printTree(UBool printHeading) {
    if (printHeading) {
        RBBIDebugPrintf(
            "-------------------------------------------------------------------\n"
            " Address type Parent LeftChild RightChild serial position value\n"
        );
    }

    this->printNode();

    // Nothing further for a null node, and the definition under a variable
    // reference is deliberately not expanded.
    if (this == NULL || fType == varRef) {
        return;
    }

    if (fLeftChild != NULL) {
        fLeftChild->printTree(FALSE);
    }
    if (fRightChild != NULL) {
        fRightChild->printTree(FALSE);
    }
}
void RBBITableBuilder::printRuleStatusTable() { int32_t thisRecord = 0; int32_t nextRecord = 0; int i; UVector *tbl = fRB->fRuleStatusVals; RBBIDebugPrintf("index | tags \n"); RBBIDebugPrintf("-------------------\n"); while (nextRecord < tbl->size()) { thisRecord = nextRecord; nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; RBBIDebugPrintf("%4d ", thisRecord); for (i=thisRecord+1; i<nextRecord; i++) { RBBIDebugPrintf(" %5d", tbl->elementAti(i)); } RBBIDebugPrintf("\n"); } RBBIDebugPrintf("\n\n"); }
//-----------------------------------------------------------------------------------
//
//  handlePrevious()
//
//      Iterate backwards, according to the logic of the reverse rules.
//      This version handles the exact style backwards rules.
//
//      The logic of this function is very similar to handleNext(), above.
//
//      statetable  the state table to run; its fFlags, fTableData and fRowLen
//                  fields are read here.
//
//      Returns BreakIterator::DONE when there is no text, no data, or the text
//      index is already 0. Otherwise returns the position found, and leaves
//      the text's native index set to that position.
//
//-----------------------------------------------------------------------------------
int32_t BreakIterator::handlePrevious(const RBBIStateTable *statetable) {
    int32_t state;
    int16_t category = 0;
    RBBIRunMode mode;
    RBBIStateTableRow *row;
    UChar32 c;
    int32_t lookaheadStatus = 0;    // non-zero while a look-ahead match is pending
    int32_t result = 0;
    int32_t initialPosition = 0;
    int32_t lookaheadResult = 0;    // position recorded when the pending look-ahead started
    UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

#ifdef RBBI_DEBUG
    if (fTrace) {
        RBBIDebugPuts("Handle Previous pos char state category");
    }
#endif

    // handlePrevious() never gets the rule status.
    // Flag the status as invalid; if the user ever asks for status, we will need
    // to back up, then re-find the break position using handleNext(), which does
    // get the status value.
    fLastStatusIndexValid = FALSE;
    fLastRuleStatusIndex = 0;

    // if we're already at the start of the text, return DONE.
    if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
        return BreakIterator::DONE;
    }

    // Set up the starting char.
    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    result = initialPosition;
    c = UTEXT_PREVIOUS32(fText);

    // Set the initial state for the state machine
    state = START_STATE;
    row = (RBBIStateTableRow *)
            (statetable->fTableData + (statetable->fRowLen * state));
    category = 3;
    mode = RBBI_RUN;
    // When the table requires a beginning-of-input step, the first loop
    // iteration uses category 2 instead of a category looked up from the text.
    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
        category = 2;
        mode = RBBI_START;
    }

    // loop until we reach the start of the text or transition to state 0
    //
    for (;;) {
        if (c == U_SENTINEL) {
            // Reached end of input string.
            if (mode == RBBI_END || *(int32_t *)fHeader->fFormatVersion == 1 ) {
                // We have already run the loop one last time with the
                // character set to the pseudo {eof} value. Now it is time
                // to unconditionally bail out.
                // (Or we have an old format binary rule file that does not support {eof}.)
                if (lookaheadResult < result) {
                    // We ran off the end of the string with a pending look-ahead match.
                    // Treat this as if the look-ahead condition had been met, and return
                    // the match at the / position from the look-ahead rule.
                    result = lookaheadResult;
                    lookaheadStatus = 0;
                } else if (result == initialPosition) {
                    // Ran off start, no match found.
                    // move one index one (towards the start, since we are doing a previous())
                    UTEXT_SETNATIVEINDEX(fText, initialPosition);
                    UTEXT_PREVIOUS32(fText);   // TODO: shouldn't be necessary. We're already at beginning. Check.
                }
                break;
            }
            // Run the loop one last time with the fake end-of-input character category.
            mode = RBBI_END;
            category = 1;
        }

        //
        // Get the char category. An incoming category of 1 or 2 means that
        // we are preset for doing the beginning or end of input, and
        // that we shouldn't get a category from an actual text input character.
        //
        if (mode == RBBI_RUN) {
            // look up the current character's character category, which tells us
            // which column in the state table to look at.
            // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
            //       not the size of the character going in, which is a UChar32.
            //
            UTRIE_GET16(&fTrie, c, category);

            // Check the dictionary bit in the character's category.
            // Counter is only used by dictionary based iterators (subclasses).
            // Chars that need to be handled by a dictionary have a flag bit set
            // in their category values.
            //
            if ((category & 0x4000) != 0) {
                fDictionaryCharCount++;
                // And off the dictionary flag bit.
                category &= ~0x4000;
            }
        }

#ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
            // Printable ASCII is shown as a quoted character, anything else in hex.
            if (0x20<=c && c<0x7f) {
                RBBIDebugPrintf("\"%c\" ", c);
            } else {
                RBBIDebugPrintf("%5x ", c);
            }
            RBBIDebugPrintf("%3d %3d\n", state, category);
        }
#endif

        // State Transition - move machine to its next state
        //
        state = row->fNextState[category];
        row = (RBBIStateTableRow *)
                (statetable->fTableData + (statetable->fRowLen * state));

        if (row->fAccepting == -1) {
            // Match found, common case.
            result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        }

        if (row->fLookAhead != 0) {
            if (lookaheadStatus != 0
                && row->fAccepting == lookaheadStatus) {
                // Lookahead match is completed.
                result = lookaheadResult;
                lookaheadStatus = 0;
                // TODO: make a standalone hard break in a rule work.
                if (lookAheadHardBreak) {
                    UTEXT_SETNATIVEINDEX(fText, result);
                    return result;
                }
                // Look-ahead completed, but other rules may match further. Continue on
                // TODO: junk this feature? I don't think it's used anywhere.
                goto continueOn;
            }

            // Start of a look-ahead: remember where we are and which
            // look-ahead value must later be seen in fAccepting to complete it.
            int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            lookaheadResult = r;
            lookaheadStatus = row->fLookAhead;
            goto continueOn;
        }

        if (row->fAccepting != 0) {
            // Because this is an accepting state, any in-progress look-ahead match
            // is no longer relevant. Clear out the pending lookahead status.
            lookaheadStatus = 0;
        }

continueOn:
        if (state == STOP_STATE) {
            // This is the normal exit from the lookup state machine.
            // We have advanced through the string until it is certain that no
            // longer match is possible, no matter what characters follow.
            break;
        }

        // Move (backwards) to the next character to process.
        // If this is a beginning-of-input loop iteration, don't advance
        // the input position. The next iteration will be processing the
        // first real input character.
        if (mode == RBBI_RUN) {
            c = UTEXT_PREVIOUS32(fText);
        } else {
            if (mode == RBBI_START) {
                mode = RBBI_RUN;
            }
        }
    }

    // The state machine is done. Check whether it found a match...

    // If the iterator failed to advance in the match engine, force it ahead by one.
    // (This really indicates a defect in the break rules. They should always match
    // at least one character.)
    if (result == initialPosition) {
        UTEXT_SETNATIVEINDEX(fText, initialPosition);
        UTEXT_PREVIOUS32(fText);
        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    }

    // Leave the iterator at our result position.
    UTEXT_SETNATIVEINDEX(fText, result);
#ifdef RBBI_DEBUG
    if (fTrace) {
        RBBIDebugPrintf("result = %d\n\n", result);
    }
#endif
    return result;
}
//--------------------------------------------------------------------------------- // // Parse RBBI rules. The state machine for rules parsing is here. // The state tables are hand-written in the file rbbirpt.txt, // and converted to the form used here by a perl // script rbbicst.pl // //--------------------------------------------------------------------------------- void RBBIRuleScanner::parse() { uint16_t state; const RBBIRuleTableEl *tableEl; if (U_FAILURE(*fRB->fStatus)) { return; } state = 1; nextChar(fC); // // Main loop for the rule parsing state machine. // Runs once per state transition. // Each time through optionally performs, depending on the state table, // - an advance to the the next input char // - an action to be performed. // - pushing or popping a state to/from the local state return stack. // for (;;) { // Bail out if anything has gone wrong. // RBBI rule file parsing stops on the first error encountered. if (U_FAILURE(*fRB->fStatus)) { break; } // Quit if state == 0. This is the normal way to exit the state machine. // if (state == 0) { break; } // Find the state table element that matches the input char from the rule, or the // class of the input character. Start with the first table row for this // state, then linearly scan forward until we find a row that matches the // character. The last row for each state always matches all characters, so // the search will stop there, if not before. 
// tableEl = &gRuleParseStateTable[state]; #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); } #endif for (;;) { #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); } #endif if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) { // Table row specified an individual character, not a set, and // the input character is not escaped, and // the input character matched it. break; } if (tableEl->fCharClass == 255) { // Table row specified default, match anything character class. break; } if (tableEl->fCharClass == 254 && fC.fEscaped) { // Table row specified "escaped" and the char was escaped. break; } if (tableEl->fCharClass == 253 && fC.fEscaped && (fC.fChar == 0x50 || fC.fChar == 0x70 )) { // Table row specified "escaped P" and the char is either 'p' or 'P'. break; } if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { // Table row specified eof and we hit eof on the input. break; } if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && fC.fEscaped == FALSE && // char is not escaped && fC.fChar != (UChar32)-1) { // char is not EOF UnicodeSet *uniset = fRuleSets[tableEl->fCharClass-128]; if (uniset->contains(fC.fChar)) { // Table row specified a character class, or set of characters, // and the current char matches it. break; } } // No match on this row, advance to the next row for this state, tableEl++; } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts(""); } // // We've found the row of the state table that matches the current input // character from the rules string. // Perform any action specified by this row in the state table. 
if (doParseActions((EParseAction)tableEl->fAction) == FALSE) { // Break out of the state machine loop if the // the action signalled some kind of error, or // the action was to exit, occurs on normal end-of-rules-input. break; } if (tableEl->fPushState != 0) { fStackPtr++; if (fStackPtr >= kStackSize) { error(U_BRK_INTERNAL_ERROR); RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); fStackPtr--; } fStack[fStackPtr] = tableEl->fPushState; } if (tableEl->fNextChar) { nextChar(fC); } // Get the next state from the table entry, or from the // state stack if the next state was specified as "pop". if (tableEl->fNextState != 255) { state = tableEl->fNextState; } else { state = fStack[fStackPtr]; fStackPtr--; if (fStackPtr < 0) { error(U_BRK_INTERNAL_ERROR); RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); fStackPtr++; } } } // // If there were NO user specified reverse rules, set up the equivalent of ".*;" // if (fRB->fReverseTree == NULL) { fRB->fReverseTree = pushNewNode(RBBINode::opStar); RBBINode *operand = pushNewNode(RBBINode::setRef); findSetFor(kAny, operand); fRB->fReverseTree->fLeftChild = operand; operand->fParent = fRB->fReverseTree; fNodeStackPtr -= 2; } // // Parsing of the input RBBI rules is complete. // We now have a parse tree for the rule expressions // and a list of all UnicodeSets that are referenced. // #ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) { fSymbolTable->rbbiSymtablePrint(); } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) { RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); fRB->fForwardTree->printTree(TRUE); RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); fRB->fReverseTree->printTree(TRUE); RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); fRB->fSafeFwdTree->printTree(TRUE); RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); fRB->fSafeRevTree->printTree(TRUE); } #endif }
//--------------------------------------------------------------------------------- // // scanSet Construct a UnicodeSet from the text at the current scan // position. Advance the scan position to the first character // after the set. // // A new RBBI setref node referring to the set is pushed onto the node // stack. // // The scan position is normally under the control of the state machine // that controls rule parsing. UnicodeSets, however, are parsed by // the UnicodeSet constructor, not by the RBBI rule parser. // //--------------------------------------------------------------------------------- void RBBIRuleScanner::scanSet() { UnicodeSet *uset; ParsePosition pos; int startPos; int i; if (U_FAILURE(*fRB->fStatus)) { return; } pos.setIndex(fScanIndex); startPos = fScanIndex; UErrorCode localStatus = U_ZERO_ERROR; uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE, fSymbolTable, localStatus); if (U_FAILURE(localStatus)) { // TODO: Get more accurate position of the error from UnicodeSet's return info. // UnicodeSet appears to not be reporting correctly at this time. #ifdef RBBI_DEBUG RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); #endif error(localStatus); delete uset; return; } // Verify that the set contains at least one code point. // if (uset->isEmpty()) { // This set is empty. // Make it an error, because it almost certainly is not what the user wanted. // Also, avoids having to think about corner cases in the tree manipulation code // that occurs later on. error(U_BRK_RULE_EMPTY_SET); delete uset; return; } // Advance the RBBI parse postion over the UnicodeSet pattern. // Don't just set fScanIndex because the line/char positions maintained // for error reporting would be thrown off. 
i = pos.getIndex(); for (;;) { if (fNextIndex >= i) { break; } nextCharLL(); } if (U_SUCCESS(*fRB->fStatus)) { RBBINode *n; n = pushNewNode(RBBINode::setRef); n->fFirstPos = startPos; n->fLastPos = fNextIndex; fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); // findSetFor() serves several purposes here: // - Adopts storage for the UnicodeSet, will be responsible for deleting. // - Mantains collection of all sets in use, needed later for establishing // character categories for run time engine. // - Eliminates mulitiple instances of the same set. // - Creates a new uset node if necessary (if this isn't a duplicate.) findSetFor(n->fText, n, uset); } }
//----------------------------------------------------------------------------- // // print - debugging function to dump the runtime data tables. // //----------------------------------------------------------------------------- void RBBIDataWrapper::printData() { #ifdef RBBI_DEBUG uint32_t c, s; RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); RBBIDebugPrintf(" Version = %d\n", fHeader->fVersion); RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); RBBIDebugPrintf(" Forward State Transition Table\n"); RBBIDebugPrintf("State | Acc LA Tag"); for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);} RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {RBBIDebugPrintf("----");} RBBIDebugPrintf("\n"); for (s=0; s<fForwardTable->fNumStates; s++) { RBBIStateTableRow *row = (RBBIStateTableRow *) (fForwardTable->fTableData + (fForwardTable->fRowLen * s)); RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTag); for (c=0; c<fHeader->fCatCount; c++) { RBBIDebugPrintf("%3d ", row->fNextState[c]); } RBBIDebugPrintf("\n"); } RBBIDebugPrintf("\nOrignal Rules source:\n"); c = 0; for (;;) { if (fRuleSource[c] == 0) break; RBBIDebugPrintf("%c", fRuleSource[c]); c++; } RBBIDebugPrintf("\n\n"); #endif }
//----------------------------------------------------------------------------- // // RBBITableBuilder::build - This is the main function for building the DFA state transtion // table from the RBBI rules parse tree. // //----------------------------------------------------------------------------- void RBBITableBuilder::build() { if (U_FAILURE(*fStatus)) { return; } // If there were no rules, just return. This situation can easily arise // for the reverse rules. if (fTree==NULL) { return; } // // Walk through the tree, replacing any references to $variables with a copy of the // parse tree for the substition expression. // fTree = fTree->flattenVariables(); if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { RBBIDebugPrintf("Parse tree after flattening variable references.\n"); fTree->printTree(TRUE); } // // Add a unique right-end marker to the expression. // Appears as a cat-node, left child being the original tree, // right child being the end marker. // RBBINode *cn = new RBBINode(RBBINode::opCat); cn->fLeftChild = fTree; fTree->fParent = cn; cn->fRightChild = new RBBINode(RBBINode::endMark); cn->fRightChild->fParent = cn; fTree = cn; // // Replace all references to UnicodeSets with the tree for the equivalent // expression. // fTree->flattenSets(); if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { RBBIDebugPrintf("Parse tree after flattening Unicode Set references.\n"); fTree->printTree(TRUE); } // // calculate the functions nullable, firstpos, lastpos and followpos on // nodes in the parse tree. // See the alogrithm description in Aho. // Understanding how this works by looking at the code alone will be // nearly impossible. 
// calcNullable(fTree); calcFirstPos(fTree); calcLastPos(fTree); calcFollowPos(fTree); if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { RBBIDebugPrintf("\n\n"); printPosSets(fTree); } // // For "chained" rules, modify the followPos sets // if (fRB->fChainRules) { calcChainedFollowPos(fTree); } // // Build the DFA state transition tables. // buildStateTable(); flagAcceptingStates(); flagLookAheadStates(); flagTaggedStates(); // // Update the global table of rule status {tag} values // The rule builder has a global vector of status values that are common // for all tables. Merge the ones from this table into the global set. // mergeRuleStatusVals(); if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; }
void RBBITableBuilder::printStates() { int c; // input "character" int n; // state number RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); RBBIDebugPrintf(" | Acc LA Tag"); for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { RBBIDebugPrintf(" %2d", c); } RBBIDebugPrintf("\n"); RBBIDebugPrintf(" |---------------"); for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { RBBIDebugPrintf("---"); } RBBIDebugPrintf("\n"); for (n=0; n<fDStates->size(); n++) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); RBBIDebugPrintf(" %3d | " , n); RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c)); } RBBIDebugPrintf("\n"); } RBBIDebugPrintf("\n\n"); }
void RBBIDataWrapper::printTable(const char * heading, const RBBIStateTable * table) { uint32_t c; uint32_t s; RBBIDebugPrintf(" %s\n", heading); RBBIDebugPrintf("State | Acc LA TagIx"); for (c = 0; c < fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);} RBBIDebugPrintf("\n------|---------------"); for (c = 0; c < fHeader->fCatCount; c++) { RBBIDebugPrintf("----"); } RBBIDebugPrintf("\n"); if (table == NULL) { RBBIDebugPrintf(" N U L L T A B L E\n\n"); return; } for (s = 0; s < table->fNumStates; s++) { RBBIStateTableRow * row = (RBBIStateTableRow *) (table->fTableData + (table->fRowLen * s)); RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx); for (c = 0; c < fHeader->fCatCount; c++) { RBBIDebugPrintf("%3d ", row->fNextState[c]); } RBBIDebugPrintf("\n"); } RBBIDebugPrintf("\n"); }