void BytesTrieTest::checkNextWithState(BytesTrie &trie, const StringAndValue data[], int32_t dataLength) { BytesTrie::State noState, state; for(int32_t i=0; i<dataLength; ++i) { if((i&1)==0) { // This should have no effect. trie.resetToState(noState); } const char *expectedString=data[i].s; int32_t stringLength=strlen(expectedString); int32_t partialLength=stringLength/3; for(int32_t j=0; j<partialLength; ++j) { if(!USTRINGTRIE_MATCHES(trie.next(expectedString[j]))) { errln("trie.next()=USTRINGTRIE_NO_MATCH for a prefix of %s", data[i].s); return; } } trie.saveState(state); UStringTrieResult resultAtState=trie.current(); UStringTrieResult result; int32_t valueAtState=-99; if(USTRINGTRIE_HAS_VALUE(resultAtState)) { valueAtState=trie.getValue(); } result=trie.next(0); // mismatch if(result!=USTRINGTRIE_NO_MATCH || result!=trie.current()) { errln("trie.next(0) matched after part of %s", data[i].s); } if( resultAtState!=trie.resetToState(state).current() || (USTRINGTRIE_HAS_VALUE(resultAtState) && valueAtState!=trie.getValue()) ) { errln("trie.next(part of %s) changes current()/getValue() after " "saveState/next(0)/resetToState", data[i].s); } else if(!USTRINGTRIE_HAS_VALUE( result=trie.next(expectedString+partialLength, stringLength-partialLength)) || result!=trie.current()) { errln("trie.next(rest of %s) does not seem to contain %s after " "saveState/next(0)/resetToState", data[i].s, data[i].s); } else if(!USTRINGTRIE_HAS_VALUE( result=trie.resetToState(state). next(expectedString+partialLength, stringLength-partialLength)) || result!=trie.current()) { errln("trie does not seem to contain %s after saveState/next(rest)/resetToState", data[i].s); } else if(trie.getValue()!=data[i].value) { errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx", data[i].s, (long)trie.getValue(), (long)trie.getValue(), (long)data[i].value, (long)data[i].value); } trie.reset(); } }
/**
 * Looks up every cached line in the byte trie, one unit at a time.
 *
 * Each UChar of a line is mapped through thaiCharToByte() and fed to the
 * trie with first()/next(). A line counts as found only if the trie can
 * consume all of its units and reports a value at the end. Lines that are
 * not found are reported on stderr, exactly once per line.
 *
 * Does nothing when noDict is set.
 */
virtual void call(UErrorCode * /*pErrorCode*/) {
    if(noDict) {
        return;
    }
    const ULine *lines=perf.getCachedLines();
    int32_t numLines=perf.getNumLines();
    for(int32_t i=0; i<numLines; ++i) {
        const UChar *line=lines[i].name;
        // Skip comment lines (start with a character below 'A').
        if(line[0]<0x41) {
            continue;
        }
        UStringTrieResult result=trie->first(thaiCharToByte(line[0]));
        int32_t lineLength=lines[i].len;
        // Set when the trie cannot continue although units of the line remain.
        bool endedEarly=false;
        for(int32_t j=1; j<lineLength; ++j) {
            if(!USTRINGTRIE_HAS_NEXT(result)) {
                endedEarly=true;
                break;
            }
            result=trie->next(thaiCharToByte(line[j]));
        }
        // Report once. Previously a mid-word mismatch printed the message in the
        // loop and then again here, because the result also had no value.
        if(endedEarly || !USTRINGTRIE_HAS_VALUE(result)) {
            fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
        }
    }
}
/**
 * Feeds code points from text into the byte trie (after mapping each through
 * thaiCharToByte()) and records the length, in code points, of every prefix
 * for which the trie reports a value.
 *
 * @param trie      Trie to walk; first() is used for the first code point.
 * @param text      Source of code points, read with utext_next32().
 * @param textLimit Maximum number of code points to consume.
 * @param lengths   Output array receiving the prefix lengths; must have room
 *                  for at least limit entries.
 * @param count     Output: number of entries written to lengths. Always set,
 *                  including when no code point could be read.
 * @param limit     Capacity of lengths.
 * @return Number of code points consumed, or 0 if the text yielded none.
 */
static int32_t
bytesTrieMatches(BytesTrie &trie,
                 UText *text, int32_t textLimit,
                 int32_t *lengths, int &count, int limit ) {
    // Define the out-parameter up front so the early return below does not
    // leave the caller with an unwritten count.
    count=0;
    UChar32 c=utext_next32(text);
    if(c<0) {
        return 0;
    }
    UStringTrieResult result=trie.first(thaiCharToByte(c));
    int32_t numChars=1;
    for(;;) {
        if(USTRINGTRIE_HAS_VALUE(result)) {
            if(count<limit) {
                // lengths[count++]=(int32_t)utext_getNativeIndex(text);
                lengths[count++]=numChars;  // CompactTrieDictionary just counts chars too.
            }
            if(result==USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        } else if(result==USTRINGTRIE_NO_MATCH) {
            break;
        }
        if(numChars>=textLimit) {
            break;
        }
        // Reuse c rather than declaring a second variable that shadows it.
        c=utext_next32(text);
        if(c<0) {
            break;
        }
        ++numChars;
        result=trie.next(thaiCharToByte(c));
    }
    return numChars;
}
/**
 * Looks up the string s in the trie stored at nameTrieBytes.
 *
 * @param s             String to look up. It is passed with length -1,
 *                      presumably meaning NUL-terminated -- confirm against
 *                      the BytesTrie::next(const char *, int32_t) contract.
 * @param nameTrieBytes Serialized trie data used to construct the BytesTrie.
 * @return The trie value for s, or -1 if matching s does not end on a value.
 */
static int32_t bytesTrieLookup(const char *s, const char *nameTrieBytes) {
    BytesTrie trie(nameTrieBytes);
    // getValue() is only evaluated when the match result carries a value.
    return USTRINGTRIE_HAS_VALUE(trie.next(s, -1)) ? trie.getValue() : -1;
}
// Closely imitate CompactTrieDictionary::matches(). // Note: CompactTrieDictionary::matches() is part of its trie implementation, // and while it loops over the text, it knows the current state. // By contrast, this implementation uses UCharsTrie API functions that have to // check the trie state each time and load/store state in the object. // (Whether it hasNext() and whether it is in the middle of a linear-match node.) static int32_t ucharsTrieMatches(UCharsTrie &trie, UText *text, int32_t textLimit, int32_t *lengths, int &count, int limit ) { UChar32 c=utext_next32(text); // Notes: // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. // b) It also ignores non-BMP code points by casting to UChar! if(c<0) { return 0; } // Should be firstForCodePoint() but CompactTrieDictionary // handles only code units. UStringTrieResult result=trie.first(c); int32_t numChars=1; count=0; for(;;) { if(USTRINGTRIE_HAS_VALUE(result)) { if(count<limit) { // lengths[count++]=(int32_t)utext_getNativeIndex(text); lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. } if(result==USTRINGTRIE_FINAL_VALUE) { break; } } else if(result==USTRINGTRIE_NO_MATCH) { break; } if(numChars>=textLimit) { // Note: Why do we have both a text limit and a UText that knows its length? break; } UChar32 c=utext_next32(text); // Notes: // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. // b) It also ignores non-BMP code points by casting to UChar! if(c<0) { break; } ++numChars; // Should be nextForCodePoint() but CompactTrieDictionary // handles only code units. result=trie.next(c); } #if 0 // Note: CompactTrieDictionary::matches() comments say that it leaves the UText // after the longest prefix match and returns the number of characters // that were matched. if(index!=lastMatch) { utext_setNativeIndex(text, lastMatch); } return lastMatch-start; // However, it does not do either of these, so I am not trying to // imitate it (or its docs) 100%. 
#endif return numChars; }
/**
 * Verifies that trie.first(b) behaves like trie.reset().next(b).
 *
 * For each non-empty test string, the first byte is fed via first() and the
 * second via next(); the same two steps are then replayed via reset()/next()
 * and the results, current() and (where present) values must agree.
 *
 * @param trie       The trie under test; it is reset before returning.
 * @param data       Strings and their expected values.
 * @param dataLength Number of entries in data.
 */
void BytesTrieTest::checkFirst(BytesTrie &trie, const StringAndValue data[], int32_t dataLength) {
    for(int32_t idx=0; idx<dataLength; ++idx) {
        int c=*data[idx].s;
        if(c==0) {
            continue;  // skip empty string
        }

        // Reference run: first() followed by next().
        const UStringTrieResult firstResult=trie.first(c);
        const int32_t firstValue=USTRINGTRIE_HAS_VALUE(firstResult) ? trie.getValue() : -1;
        const UStringTrieResult nextResult=trie.next(data[idx].s[1]);

        // Replay via reset()/next(). Each later check only runs while all earlier
        // ones agreed, mirroring a short-circuit chain of comparisons.
        bool agrees=(firstResult==trie.reset().next(c));
        agrees=agrees && (firstResult==trie.current());
        if(agrees) {
            const int32_t replayedValue=USTRINGTRIE_HAS_VALUE(firstResult) ? trie.getValue() : -1;
            agrees=(firstValue==replayedValue);
        }
        agrees=agrees && (nextResult==trie.next(data[idx].s[1]));

        if(!agrees) {
            errln("trie.first(%c)!=trie.reset().next(same) for %s",
                  c, data[idx].s);
        }
    }
    trie.reset();
}
/**
 * Looks up every cached line in the trie with a single whole-string
 * next() call after reset(), and reports lines that do not end on a value
 * to stderr.
 */
virtual void call(UErrorCode * /*pErrorCode*/) {
    const ULine *cachedLines=perf.getCachedLines();
    const int32_t lineCount=perf.getNumLines();
    for(int32_t idx=0; idx<lineCount; ++idx) {
        const ULine &entry=cachedLines[idx];
        // Only process real words; comment lines start with a character below 'A'.
        if(entry.name[0]>=0x41) {
            if(!USTRINGTRIE_HAS_VALUE(trie->reset().next(entry.name, entry.len))) {
                fprintf(stderr, "word %ld (0-based) not found\n", (long)idx);
            }
        }
    }
}
/**
 * Finds dictionary words that are prefixes of the text at its current position.
 *
 * Code points are read with utext_next32() and fed to a UCharsTrie built from
 * `characters`. Each time the trie reports a value, and at least minLength
 * code points have been consumed, one match is recorded (up to `limit`).
 *
 * Code points contained in ignoreSet are still fed to the trie and counted,
 * but no match is recorded or termination check made at that position.
 *
 * NOTE(review): both the ignoreSet and the minLength `continue` paths skip
 * the maxLength check -- confirm that this is intended.
 *
 * @param text      Text to read from, starting at its current index.
 * @param maxLength Stop once this many native units have been consumed.
 * @param limit     Capacity of the output arrays.
 * @param lengths   Optional output: matched lengths in native index units.
 * @param cpLengths Optional output: matched lengths in code points.
 * @param values    Optional output: trie values of the matches.
 * @param prefix    Optional output: number of code points consumed.
 * @param ignoreSet Optional set of code points at which matches are not recorded.
 * @param minLength Minimum number of code points for a match to be recorded.
 * @return Number of matches recorded.
 */
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
    UCharsTrie uct(characters);
    const int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t wordCount = 0;
    int32_t codePointsMatched = 0;

    for (;;) {
        const UChar32 c = utext_next32(text);
        if (c < 0) {
            break;
        }

        // The very first code point starts a fresh walk; later ones continue it.
        UStringTrieResult result;
        if (codePointsMatched == 0) {
            result = uct.first(c);
        } else {
            result = uct.next(c);
        }
        const int32_t lengthMatched =
            static_cast<int32_t>(utext_getNativeIndex(text) - startingTextIndex);
        ++codePointsMatched;

        if (ignoreSet != NULL && ignoreSet->contains(c)) {
            continue;
        }

        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (codePointsMatched < minLength) {
                continue;
            }
            if (wordCount < limit) {
                if (values != NULL) {
                    values[wordCount] = uct.getValue();
                }
                if (lengths != NULL) {
                    lengths[wordCount] = lengthMatched;
                }
                if (cpLengths != NULL) {
                    cpLengths[wordCount] = codePointsMatched;
                }
                ++wordCount;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        } else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        if (lengthMatched >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = codePointsMatched;
    }
    return wordCount;
}
/**
 * Tests whether the trie contains the given name, using loose matching.
 *
 * Each character is lowercased with uprv_invCharToLowercaseAscii(), and
 * '-', '_', space and the characters 0x09..0x0d are skipped before the
 * remaining characters are fed to the trie.
 *
 * @param trie Trie to walk; it is advanced with next() and not reset here.
 * @param name NUL-terminated name to look up; may be NULL.
 * @return TRUE if the walk ends on a value; FALSE if name is NULL or the
 *         trie cannot continue while significant characters remain.
 */
UBool PropNameData::containsName(BytesTrie &trie, const char *name) {
    if(name==NULL) {
        return FALSE;
    }
    UStringTrieResult result=USTRINGTRIE_NO_VALUE;
    for(; *name!=0; ++name) {
        const char c=uprv_invCharToLowercaseAscii(*name);
        // Ignore delimiters '-', '_', and ASCII White_Space.
        const bool isDelimiter=
            c==0x2d || c==0x5f || c==0x20 || (0x09<=c && c<=0x0d);
        if(isDelimiter) {
            continue;
        }
        // A significant character remains, so the trie must be able to continue.
        if(!USTRINGTRIE_HAS_NEXT(result)) {
            return FALSE;
        }
        result=trie.next((uint8_t)c);
    }
    return USTRINGTRIE_HAS_VALUE(result);
}
/**
 * Finds dictionary words that are prefixes of the text at its current position.
 *
 * Code points are read with utext_next32(), mapped through transform(), and
 * fed to a BytesTrie built from `characters`. Each time the trie reports a
 * value, one match is recorded (up to `limit`).
 *
 * @param text      Text to read from, starting at its current index.
 * @param maxLength Stop once this many native units have been consumed.
 * @param limit     Capacity of the output arrays.
 * @param lengths   Optional output: matched lengths in native index units.
 * @param cpLengths Optional output: matched lengths in code points.
 * @param values    Optional output: trie values of the matches.
 * @param prefix    Optional output: number of code points consumed.
 * @return Number of matches recorded.
 */
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const {
    BytesTrie bt(characters);
    const int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t wordCount = 0;
    int32_t codePointsMatched = 0;

    for (;;) {
        const UChar32 c = utext_next32(text);
        if (c < 0) {
            break;
        }

        // The very first code point starts a fresh walk; later ones continue it.
        UStringTrieResult result;
        if (codePointsMatched == 0) {
            result = bt.first(transform(c));
        } else {
            result = bt.next(transform(c));
        }
        const int32_t lengthMatched =
            static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;
        ++codePointsMatched;

        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (wordCount < limit) {
                if (values != NULL) {
                    values[wordCount] = bt.getValue();
                }
                if (lengths != NULL) {
                    lengths[wordCount] = lengthMatched;
                }
                if (cpLengths != NULL) {
                    cpLengths[wordCount] = codePointsMatched;
                }
                ++wordCount;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        } else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        if (lengthMatched >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = codePointsMatched;
    }
    return wordCount;
}
/**
 * Finds dictionary words that are prefixes of the text at its current position.
 *
 * Code points are read with utext_next32() and fed to a UCharsTrie built from
 * `characters`. For every prefix at which the trie reports a value, its length
 * in code points (and optionally its value) is recorded, up to `limit` entries.
 *
 * @param text      Text to read from, starting at its current index.
 * @param maxLength Maximum number of code points to consume.
 * @param lengths   Output array receiving the prefix lengths; must have room
 *                  for at least limit entries.
 * @param count     Output: number of entries written. Always set, including
 *                  when no code point could be read.
 * @param limit     Capacity of lengths (and of values, if given).
 * @param values    Optional output: trie values of the matches.
 * @return Number of code points consumed, or 0 if the text yielded none.
 */
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    UCharsTrie uct(characters);
    // Define the out-parameter up front so the early return below does not
    // leave the caller with an unwritten count.
    count = 0;
    UChar32 c = utext_next32(text);
    if (c < 0) {
        return 0;
    }
    UStringTrieResult result = uct.first(c);
    int32_t numChars = 1;
    for (;;) {
        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (count < limit) {
                if (values != NULL) {
                    values[count] = uct.getValue();
                }
                lengths[count++] = numChars;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        } else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        // TODO: why do we have a text limit if the UText knows its length?
        if (numChars >= maxLength) {
            break;
        }

        c = utext_next32(text);
        if (c < 0) {
            break;
        }
        ++numChars;
        result = uct.next(c);
    }
    return numChars;
}
void BytesTrieTest::checkNext(BytesTrie &trie, const StringAndValue data[], int32_t dataLength) { BytesTrie::State state; for(int32_t i=0; i<dataLength; ++i) { int32_t stringLength= (i&1) ? -1 : strlen(data[i].s); UStringTrieResult result; if( !USTRINGTRIE_HAS_VALUE(result=trie.next(data[i].s, stringLength)) || result!=trie.current() ) { errln("trie does not seem to contain %s", data[i].s); } else if(trie.getValue()!=data[i].value) { errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx", data[i].s, (long)trie.getValue(), (long)trie.getValue(), (long)data[i].value, (long)data[i].value); } else if(result!=trie.current() || trie.getValue()!=data[i].value) { errln("trie value for %s changes when repeating current()/getValue()", data[i].s); } trie.reset(); stringLength=strlen(data[i].s); result=trie.current(); for(int32_t j=0; j<stringLength; ++j) { if(!USTRINGTRIE_HAS_NEXT(result)) { errln("trie.current()!=hasNext before end of %s (at index %d)", data[i].s, j); break; } if(result==USTRINGTRIE_INTERMEDIATE_VALUE) { trie.getValue(); if(trie.current()!=USTRINGTRIE_INTERMEDIATE_VALUE) { errln("trie.getValue().current()!=USTRINGTRIE_INTERMEDIATE_VALUE before end of %s (at index %d)", data[i].s, j); break; } } result=trie.next(data[i].s[j]); if(!USTRINGTRIE_MATCHES(result)) { errln("trie.next()=USTRINGTRIE_NO_MATCH before end of %s (at index %d)", data[i].s, j); break; } if(result!=trie.current()) { errln("trie.next()!=following current() before end of %s (at index %d)", data[i].s, j); break; } } if(!USTRINGTRIE_HAS_VALUE(result)) { errln("trie.next()!=hasValue at the end of %s", data[i].s); continue; } trie.getValue(); if(result!=trie.current()) { errln("trie.current() != current()+getValue()+current() after end of %s", data[i].s); } // Compare the final current() with whether next() can actually continue. trie.saveState(state); UBool nextContinues=FALSE; // Try all graphic characters; we only use those in test strings in this file. 
#if U_CHARSET_FAMILY==U_ASCII_FAMILY const int32_t minChar=0x20; const int32_t maxChar=0x7e; #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY const int32_t minChar=0x40; const int32_t maxChar=0xfe; #else const int32_t minChar=0; const int32_t maxChar=0xff; #endif for(int32_t c=minChar; c<=maxChar; ++c) { if(trie.resetToState(state).next(c)) { nextContinues=TRUE; break; } } if((result==USTRINGTRIE_INTERMEDIATE_VALUE)!=nextContinues) { errln("(trie.current()==USTRINGTRIE_INTERMEDIATE_VALUE) contradicts " "(trie.next(some byte)!=USTRINGTRIE_NO_MATCH) after end of %s", data[i].s); } trie.reset(); } }