void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { UScriptCode scripts[30]; int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); if (U_FAILURE(status)) { return; } if (status == U_USING_DEFAULT_WARNING) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } UnicodeSet tmpSet; int32_t i; for (i=0; i<numScripts; i++) { tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); allowedChars->addAll(tmpSet); } }
// Record the character name and the "correction" name alias for one code
// point, taken from a single ppucd.txt properties line, and pass them on
// to addLine() for inclusion in the names data.
void NamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    // Nothing to do unless this line changed the name or a name alias.
    if(!newValues.contains(UCHAR_NAME) && !newValues.contains(PPUCD_NAME_ALIAS)) {
        return;
    }
    // Names apply to single code points, never to ranges.
    U_ASSERT(props.start==props.end);
    // Name slots: [0]=character name, [3]=correction alias; [1] and [2] are
    // left empty here.
    const char *names[4]={ NULL, NULL, NULL, NULL };
    int16_t lengths[4]={ 0, 0, 0, 0 };
    /* get the character name */
    if(props.name!=NULL) {
        names[0]=props.name;
        lengths[0]=(int16_t)uprv_strlen(props.name);
        parseName(names[0], lengths[0]);
    }
    CharString buffer;
    if(props.nameAlias!=NULL) {
        /*
         * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line.
         * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
         */
        const char *corr=uprv_strstr(props.nameAlias, "correction=");
        if(corr!=NULL) {
            corr+=11;  // skip "correction="
            // The alias value extends to the next comma, or to end of string.
            const char *limit=uprv_strchr(corr, ',');
            if(limit!=NULL) {
                // Copy so that we have a bounded string for parseName().
                buffer.append(corr, limit-corr, errorCode);
                names[3]=buffer.data();
                lengths[3]=(int16_t)(limit-corr);
            } else {
                names[3]=corr;
                lengths[3]=(int16_t)uprv_strlen(corr);
            }
            parseName(names[3], lengths[3]);
        }
    }
    addLine(props.start, names, lengths, LENGTHOF(names));
}
void TransliteratorErrorTest::TestUnicodeSetErrors() { UnicodeString badPattern="[[:L:]-[0x0300-0x0400]"; UnicodeSet set; UErrorCode status = U_ZERO_ERROR; UnicodeString result; if (!set.isEmpty()) { errln("FAIL: The default ctor of UnicodeSet created a non-empty object."); } set.applyPattern(badPattern, status); if (U_SUCCESS(status)) { errln("FAIL: Applied a bad pattern to the UnicodeSet object okay."); } status = U_ZERO_ERROR; UnicodeSet *set1 = new UnicodeSet(badPattern, status); if (U_SUCCESS(status)) { errln("FAIL: Created a UnicodeSet based on bad patterns."); } delete set1; }
// Install a (frozen) clone of `chars` as the checker's allowed-character set
// and enable the USPOOF_CHAR_LIMIT check.
//
// Fix: when clone() succeeded but produced a bogus set, the clone was
// previously leaked on the error return; it is now deleted.
U_CAPI void U_EXPORT2
uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return;
    }
    if (chars->isBogus()) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
    if (clonedSet == NULL || clonedSet->isBogus()) {
        delete clonedSet;  // no-op when NULL; frees a bogus clone instead of leaking it
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Freeze for thread-safe, allocation-free contains() at check time.
    clonedSet->freeze();
    delete This->fAllowedCharsSet;
    This->fAllowedCharsSet = clonedSet;
    This->fChecks |= USPOOF_CHAR_LIMIT;
}
// Return the length (in UChars) of the initial run of code points for which
// set.contains() equals `tf`, walking the UTF-16 string with U16_NEXT.
static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) {
    int32_t pos = 0;
    for (;;) {
        if (pos >= length) {
            return pos;  // consumed the whole string
        }
        int32_t next = pos;
        UChar32 cp;
        U16_NEXT(s, next, length, cp);
        if (set.contains(cp) != tf) {
            return pos;  // stop before the first non-matching code point
        }
        pos = next;
    }
}
// Pack the bidi-related properties of one ppucd.txt line (range start..end)
// into a 32-bit trie value, and store Joining_Group separately in byte arrays.
void BiDiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }
    UChar32 start=props.start;
    UChar32 end=props.end;
    // The runtime code relies on this invariant for returning both bmg and bpb
    // from the same data.
    int32_t bpt=props.getIntProp(UCHAR_BIDI_PAIRED_BRACKET_TYPE);
    if(!(bpt==0 ? props.bpb==U_SENTINEL : props.bpb==props.bmg)) {
        fprintf(stderr, "genprops error: invariant not true: "
                "if(bpt==None) then bpb=<none> else bpb=bmg\n");
        return;
    }
    // Encode the mirroring-glyph delta, then OR in the flag bits and the
    // small int-property fields at their defined shifts.
    int32_t delta=encodeBidiMirroringGlyph(start, end, props.bmg, errorCode);
    uint32_t value=(uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT;
    if(props.binProps[UCHAR_BIDI_MIRRORED]) {
        value|=U_MASK(UBIDI_IS_MIRRORED_SHIFT);
    }
    if(props.binProps[UCHAR_BIDI_CONTROL]) {
        value|=U_MASK(UBIDI_BIDI_CONTROL_SHIFT);
    }
    if(props.binProps[UCHAR_JOIN_CONTROL]) {
        value|=U_MASK(UBIDI_JOIN_CONTROL_SHIFT);
    }
    value|=(uint32_t)bpt<<UBIDI_BPT_SHIFT;
    value|=(uint32_t)props.getIntProp(UCHAR_JOINING_TYPE)<<UBIDI_JT_SHIFT;
    value|=(uint32_t)props.getIntProp(UCHAR_BIDI_CLASS);
    utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: BiDiPropsBuilder utrie2_setRange32() failed - %s\n",
                u_errorName(errorCode));
        return;
    }
    // Store Joining_Group values from vector column 1 in simple byte arrays.
    // Code points outside the two array windows may only have the default
    // Joining_Group value.
    int32_t jg=props.getIntProp(UCHAR_JOINING_GROUP);
    for(UChar32 c=start; c<=end; ++c) {
        int32_t jgStart;  // NOTE(review): unused local — candidate for removal
        if(MIN_JG_START<=c && c<MAX_JG_LIMIT) {
            jgArray[c-MIN_JG_START]=(uint8_t)jg;
        } else if(MIN_JG_START2<=c && c<MAX_JG_LIMIT2) {
            jgArray2[c-MIN_JG_START2]=(uint8_t)jg;
        } else if(jg!=U_JG_NO_JOINING_GROUP) {
            fprintf(stderr, "genprops error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n",
                    (long)start, (long)end);
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
    }
}
// Parse a space-separated list of script codes (a ppucd.txt scx= value) into
// `scx`, one UScriptCode per element. Reports U_PARSE_ERROR for an unknown
// script code, a duplicate code, or an empty list.
void PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    scx.clear();
    CharString scString;
    for(;;) {
        const char *scs;
        const char *scLimit=strchr(s, ' ');
        if(scLimit!=NULL) {
            // Copy the token so the lookup sees a NUL-terminated string.
            scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
            if(U_FAILURE(errorCode)) { return; }
        } else {
            scs=s;  // last token is already NUL-terminated
        }
        int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
        if(script==UCHAR_INVALID_CODE) {
            fprintf(stderr,
                    "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
                    scs, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return;
        } else if(scx.contains(script)) {
            fprintf(stderr,
                    "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
                    scs, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return;
        } else {
            scx.add(script);
        }
        if(scLimit!=NULL) {
            s=scLimit+1;  // advance past the separator to the next token
        } else {
            break;
        }
    }
    if(scx.isEmpty()) {
        fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
        errorCode=U_PARSE_ERROR;
    }
}
// Populate bucketList_ from labels_: an underflow bucket first, then one
// normal bucket per label with inflow buckets inserted between labels whose
// script sets do not overlap, and finally an overflow bucket.
void AlphabeticIndex::buildBucketList(UErrorCode &status) {
    UnicodeString labelStr = getUnderflowLabel();
    Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
    bucketList_->addElement(b, status);

    // Build up the list, adding underflow, additions, overflow
    // insert infix labels as needed, using \uFFFF.
    const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
    b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
    bucketList_->addElement(b, status);

    // Track the script set of the previous label to detect script changes.
    UnicodeSet lastSet;
    UnicodeSet set;
    AlphabeticIndex::getScriptSet(lastSet, *last, status);
    lastSet.removeAll(*IGNORE_SCRIPTS);

    for (int i = 1; i < labels_->size(); ++i) {
        UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
        getScriptSet(set, *current, status);
        set.removeAll(*IGNORE_SCRIPTS);
        if (lastSet.containsNone(set)) {
            // check for adjacent
            const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
            if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
                labelStr = getInflowLabel();
                b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
                bucketList_->addElement(b, status);
                // NOTE(review): this i++ skips the label following the inflow
                // insertion point — confirm that is the intended behavior.
                i++;
                lastSet = set;
            }
        }
        b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
        bucketList_->addElement(b, status);
        last = current;
        lastSet = set;
    }
    // Everything that sorts after the last label goes into the overflow bucket.
    const UnicodeString &limitString = getOverflowComparisonString(*last, status);
    b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
    bucketList_->addElement(b, status); // final overflow bucket
}
// Performance-test command constructor. Before timing anything, it rebuilds
// the test set from scratch via spanBackUTF8() over every code point and
// compares it with the (frozen) original as a sanity check.
SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
    // Verify that the frozen set is equal to the unfrozen one.
    UnicodeSet set;
    char utf8[4];
    UChar32 c;
    int32_t length;
    for(c=0; c<=0x10ffff; ++c) {
        if(c==0xd800) {
            // Jump over the surrogate range, which is not representable in UTF-8.
            c=0xe000;
        }
        length=0;
        U8_APPEND_UNSAFE(utf8, length, c);
        // A backward span that reaches offset 0 means the single code point
        // is contained in the set.
        if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) {
            set.add(c);
        }
    }
    if(set!=testcase.set) {
        fprintf(stderr, "error: frozen set != original!\n");
    }
}
// Union into `toUnionTo` every character this replacer's output can produce:
// literal characters directly, stand-in characters via their replacer.
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    int32_t idx = 0;
    while (idx < output.length()) {
        UChar32 cp = output.char32At(idx);
        UnicodeReplacer* rep = data->lookupReplacer(cp);
        if (rep != NULL) {
            // Stand-in character: delegate to the nested replacer.
            rep->addReplacementSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
        idx += UTF_CHAR_LENGTH(cp);
    }
}
/**
 * Implement UnicodeMatcher.
 * Union into `toUnionTo` every character this pattern can match: literal
 * characters directly, stand-in characters via their nested matcher.
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    int32_t idx = 0;
    while (idx < pattern.length()) {
        UChar32 cp = pattern.char32At(idx);
        const UnicodeMatcher* nested = data->lookupMatcher(cp);
        if (nested != NULL) {
            // Stand-in character: delegate to the matcher it represents.
            nested->addMatchSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
        idx += UTF_CHAR_LENGTH(cp);
    }
}
/**
 * Union the set of all characters that may be modified by this rule
 * into the given set. Only the key segment of the pattern (between the
 * ante and post contexts) is considered.
 */
void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
    int32_t pos = anteContextLength;
    const int32_t keyLimit = anteContextLength + keyLength;
    while (pos < keyLimit) {
        UChar32 cp = pattern.char32At(pos);
        pos += UTF_CHAR_LENGTH(cp);
        const UnicodeMatcher* nested = data->lookupMatcher(cp);
        if (nested != NULL) {
            // Stand-in character: add everything its matcher can match.
            nested->addMatchSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
    }
}
// Return a new UVector of UnicodeStrings: the script-first-primary boundary
// strings defined in the root collator (all prefixed with U+FDD1), filtered
// to real scripts (Letter) and the unassigned-implicit-weights group (Cn).
// Caller owns the returned vector; returns NULL with `status` set on error.
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    LocalPointer<UVector> dest(new UVector(status), status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);
    // Fetch the script-first-primary contractions which are defined in the root collator.
    // They all start with U+FDD1.
    UnicodeSet set;
    collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (set.isEmpty()) {
        status = U_UNSUPPORTED_ERROR;
        return NULL;
    }
    UnicodeSetIterator iter(set);
    while (iter.next()) {
        const UnicodeString &boundary = iter.getString();
        // The sample character follows the U+FDD1 prefix at index 1.
        uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
        if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
            // Ignore boundaries for the special reordering groups.
            // Take only those for "real scripts" (where the sample character is
            // a Letter), and the one for unassigned implicit weights (Cn).
            continue;
        }
        UnicodeString *s = new UnicodeString(boundary);
        if (s == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        // NOTE(review): if addElement() fails, `s` leaks — consider adopting
        // it via LocalPointer before the call.
        dest->addElement(s, status);
    }
    return dest.orphan();
}
// Construct a dictionary-based break engine for CJK text. Adopts
// `adoptDictionary`. `type` selects the handled character repertoire:
// Hangul syllables for Korean, Han + Katakana + Hiragana for Chinese/Japanese.
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
    : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70);  // halfwidth katakana-hiragana prolonged sound mark
            cjSet.add(0x30FC);  // katakana-hiragana prolonged sound mark
            setCharacters(cjSet);
        }
    }
}
void StaticUnicodeSetsTest::testSetCoverage() { UErrorCode status = U_ZERO_ERROR; // Lenient comma/period should be supersets of strict comma/period; // it also makes the coverage logic cheaper. assertTrue( "COMMA should be superset of STRICT_COMMA", get(unisets::COMMA)->containsAll(*get(unisets::STRICT_COMMA))); assertTrue( "PERIOD should be superset of STRICT_PERIOD", get(unisets::PERIOD)->containsAll(*get(unisets::STRICT_PERIOD))); UnicodeSet decimals; decimals.addAll(*get(unisets::STRICT_COMMA)); decimals.addAll(*get(unisets::STRICT_PERIOD)); decimals.freeze(); UnicodeSet grouping; grouping.addAll(decimals); grouping.addAll(*get(unisets::OTHER_GROUPING_SEPARATORS)); decimals.freeze(); const UnicodeSet &plusSign = *get(unisets::PLUS_SIGN); const UnicodeSet &minusSign = *get(unisets::MINUS_SIGN); const UnicodeSet &percent = *get(unisets::PERCENT_SIGN); const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN); const UnicodeSet &infinity = *get(unisets::INFINITY_KEY); int32_t localeCount; const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount); for (int32_t i = 0; i < localeCount; i++) { Locale locale = allAvailableLocales[i]; DecimalFormatSymbols dfs(locale, status); UnicodeString localeName; locale.getDisplayName(localeName); assertSuccess(UnicodeString("Making DFS for ") + localeName, status); #define ASSERT_IN_SET(name, foo) assertInSet(localeName, UnicodeString("" #name ""), name, foo) ASSERT_IN_SET(decimals, dfs.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol)); ASSERT_IN_SET(grouping, dfs.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol)); ASSERT_IN_SET(plusSign, dfs.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol)); ASSERT_IN_SET(minusSign, dfs.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol)); ASSERT_IN_SET(percent, dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol)); ASSERT_IN_SET(permille, dfs.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol)); ASSERT_IN_SET(infinity, 
dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol)); } }
//---------------------------------------------------------------------------------
//
//  scanSet    Construct a UnicodeSet from the text at the current scan
//             position.  Advance the scan position to the first character
//             after the set.
//
//             A new RBBI setref node referring to the set is pushed onto the node
//             stack.
//
//             The scan position is normally under the control of the state machine
//             that controls rule parsing.  UnicodeSets, however, are parsed by
//             the UnicodeSet constructor, not by the RBBI rule parser.
//
//---------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() {
    UnicodeSet    *uset;
    ParsePosition  pos;
    int            startPos;
    int            i;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    pos.setIndex(fScanIndex);
    startPos = fScanIndex;
    // Parse the set with a local status so a set syntax error can be reported
    // through error() without clobbering an earlier builder status.
    UErrorCode localStatus = U_ZERO_ERROR;
    uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE, fSymbolTable, localStatus);
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
#ifdef RBBI_DEBUG
        RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
#endif
        error(localStatus);
        delete uset;
        return;
    }

    // Verify that the set contains at least one code point.
    //
    if (uset->isEmpty()) {
        // This set is empty.
        //   Make it an error, because it almost certainly is not what the user wanted.
        //   Also, avoids having to think about corner cases in the tree manipulation code
        //   that occurs later on.
        error(U_BRK_RULE_EMPTY_SET);
        delete uset;
        return;
    }

    // Advance the RBBI parse postion over the UnicodeSet pattern.
    //   Don't just set fScanIndex because the line/char positions maintained
    //   for error reporting would be thrown off.
    i = pos.getIndex();
    for (;;) {
        if (fNextIndex >= i) {
            break;
        }
        nextCharLL();
    }

    if (U_SUCCESS(*fRB->fStatus)) {
        RBBINode         *n;
        n = pushNewNode(RBBINode::setRef);
        n->fFirstPos = startPos;
        n->fLastPos  = fNextIndex;
        // Remember the set's source text on the node for diagnostics and lookup.
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        //  findSetFor() serves several purposes here:
        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
        //     - Mantains collection of all sets in use, needed later for establishing
        //          character categories for run time engine.
        //     - Eliminates mulitiple instances of the same set.
        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
        findSetFor(n->fText, n, uset);
    }
}
/**
 * Parse the pattern from the given RuleCharacterIterator.  The
 * iterator is advanced over the parsed pattern.
 * @param chars iterator over the pattern characters.  Upon return
 * it will be advanced to the first character after the parsed
 * pattern, or the end of the iteration if all characters are
 * parsed.
 * @param symbols symbol table to use to parse and dereference
 * variables, or null if none.
 * @param rebuiltPat the pattern that was parsed, rebuilt or
 * copied from the input pattern, as appropriate.
 * @param options a bit mask of zero or more of the following:
 * IGNORE_SPACE, CASE.
 * @param caseClosure member function applied for the case-closure options.
 * @param ec in/out error code; U_MALFORMED_SET on any syntax error.
 */
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;

    // Syntax characters: [ ] ^ - & { }
    // Recognized special forms for chars, sets: c-c s-s s&s

    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    UnicodeString patLocal, buf;
    UBool usePat = FALSE;
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    UChar32 lastChar = 0;
    UChar op = 0;

    UBool invert = FALSE;

    clear();

    while (mode != 2 && !chars.atEnd()) {
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
                                    op == INTERSECTION /*'&'*/)));

        UChar32 c = 0;
        UBool literal = FALSE;
        UnicodeSet* nested = 0; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed.  If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == 0x5B /*'['*/ && !literal) {
                if (mode == 1) {
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append((UChar) 0x5B /*'['*/);
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    if (c == 0x5E /*'^'*/ && !literal) {
                        invert = TRUE;
                        patLocal.append((UChar) 0x5E /*'^'*/);
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == HYPHEN /*'-'*/) {
                        literal = TRUE;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != 0) {
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != 0) {
                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
                    if (ms == NULL) {
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = const_cast<UnicodeSet*>(ms);
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set.  This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastItem = 0;
                op = 0;
            }

            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
                patLocal.append(op);
            }

            if (nested == 0) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                // Inline nested [...] — recurse.
                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
                break;
            case 2:
                // \p{...} or [:...:] property pattern.
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, FALSE);
                break;
            }

            usePat = TRUE;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            switch (op) {
            case HYPHEN: /*'-'*/
                removeAll(*nested);
                break;
            case INTERSECTION: /*'&'*/
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters.  If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case 0x5D /*']'*/:
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                // Treat final trailing '-' as a literal
                if (op == HYPHEN /*'-'*/) {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == INTERSECTION /*'&'*/) {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append((UChar) 0x5D /*']'*/);
                mode = 2;
                continue;
            case HYPHEN /*'-'*/:
                if (op == 0) {
                    if (lastItem != 0) {
                        op = (UChar) c;
                        continue;
                    } else {
                        // Treat final trailing '-' as a literal
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x5D /*']'*/ && !literal) {
                            patLocal.append(HYPHEN_RIGHT_BRACE, 2);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case INTERSECTION /*'&'*/:
                if (lastItem == 2 && op == 0) {
                    op = (UChar) c;
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case 0x5E /*'^'*/:
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case 0x7B /*'{'*/:
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    // Collect the {multi-character string} up to the closing '}'.
                    UBool ok = FALSE;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x7D /*'}'*/ && !literal) {
                            ok = TRUE;
                            break;
                        }
                        buf.append(c);
                    }
                    if (buf.length() < 1 || !ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append((UChar) 0x7B /*'{'*/);
                _appendToPat(patLocal, buf, FALSE);
                patLocal.append((UChar) 0x7D /*'}'*/);
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
                    if (symbols == 0 && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, FALSE);
                        }
                        // Trailing '$' before ']' is the anchor, represented
                        // internally by U_ETHER.
                        add(U_ETHER);
                        usePat = TRUE;
                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
                        patLocal.append((UChar) 0x5D /*']'*/);
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters.  This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == HYPHEN /*'-'*/) {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, FALSE);
                patLocal.append(op);
                _appendToPat(patLocal, c, FALSE);
                lastItem = 0;
                op = 0;
            } else {
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity).  If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING.  This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        (this->*caseClosure)(USET_CASE_INSENSITIVE);
    } else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
        complement();
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}
//---------------------------------------------------------------------------- // // main for genctd // //---------------------------------------------------------------------------- int main(int argc, char **argv) { UErrorCode status = U_ZERO_ERROR; const char *wordFileName; const char *outFileName; const char *outDir = NULL; const char *copyright = NULL; // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[0].doesOccur || options[1].doesOccur) { // -? or -h for help. usageAndDie(0); } if (!options[3].doesOccur || argc < 2) { fprintf(stderr, "input and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } outFileName = options[3].value; wordFileName = argv[1]; if (options[4].doesOccur) { u_setDataDirectory(options[4].value); } status = U_ZERO_ERROR; /* Combine the directory with the file name */ if(options[5].doesOccur) { outDir = options[5].value; } if (options[6].doesOccur) { copyright = U_COPYRIGHT_STRING; } #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status); return (int)status; #else /* Initialize ICU */ u_init(&status); if (U_FAILURE(status)) { fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", argv[0], u_errorName(status)); exit(1); } status = U_ZERO_ERROR; // // Read in the dictionary source file // long result; long wordFileSize; FILE *file; char *wordBufferC; MutableTrieDictionary *mtd = NULL; file = fopen(wordFileName, "rb"); if( file == 0 ) { //cannot find file //create 1-line dummy file: ie 1 char, 1 value UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); fprintf(stderr, "%s\n", msg); UChar c = 0x0020; mtd = new MutableTrieDictionary(c, status, TRUE); mtd->addWord(&c, 1, status, 1); } else { //read words in from input file fseek(file, 0, SEEK_END); wordFileSize = ftell(file); fseek(file, 0, SEEK_SET); wordBufferC = new char[wordFileSize+10]; result = (long)fread(wordBufferC, 1, wordFileSize, file); if (result != wordFileSize) { fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); exit (-1); } wordBufferC[wordFileSize]=0; fclose(file); // // Look for a Unicode Signature (BOM) on the word file // int32_t signatureLength; const char * wordSourceC = wordBufferC; const char* encoding = ucnv_detectUnicodeSignature( wordSourceC, wordFileSize, &signatureLength, &status); if (U_FAILURE(status)) { exit(status); } if(encoding!=NULL ){ wordSourceC += signatureLength; wordFileSize -= signatureLength; } // // Open a converter to take the rule file to UTF-16 // UConverter* conv; conv = ucnv_open(encoding, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // // Convert the words to UChar. // Preflight first to determine required buffer size. 
// uint32_t destCap = ucnv_toUChars(conv, NULL, // dest, 0, // destCapacity, wordSourceC, wordFileSize, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; status = U_ZERO_ERROR; UChar *wordSourceU = new UChar[destCap+1]; ucnv_toUChars(conv, wordSourceU, // dest, destCap+1, wordSourceC, wordFileSize, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; ucnv_close(conv); // Get rid of the original file buffer delete[] wordBufferC; // Create a MutableTrieDictionary, and loop through all the lines, inserting // words. // First, pick a median character. UChar *current = wordSourceU + (destCap/2); UChar uc = *current++; UnicodeSet breaks; breaks.add(0x000A); // Line Feed breaks.add(0x000D); // Carriage Return breaks.add(0x2028); // Line Separator breaks.add(0x2029); // Paragraph Separator do { // Look for line break while (uc && !breaks.contains(uc)) { uc = *current++; } // Now skip to first non-line-break while (uc && breaks.contains(uc)) { uc = *current++; } } while (uc && (breaks.contains(uc) || u_isspace(uc))); mtd = new MutableTrieDictionary(uc, status); if (U_FAILURE(status)) { fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Now add the words. Words are non-space characters at the beginning of // lines, and must be at least one UChar. If a word has an associated value, // the value should follow the word on the same line after a tab character. 
current = wordSourceU; UChar *candidate = current; uc = *current++; int32_t length = 0; int count = 0; while (uc) { while (uc && !u_isspace(uc)) { ++length; uc = *current++; } UnicodeString valueString; UChar candidateValue; if(uc == 0x0009){ //separator is a tab char, read in number after space while (uc && u_isspace(uc)) { uc = *current++; } while (uc && !u_isspace(uc)) { valueString.append(uc); uc = *current++; } } if (length > 0) { count++; if(valueString.length() > 0){ mtd->setValued(TRUE); uint32_t value = 0; char* s = new char[valueString.length()]; valueString.extract(0,valueString.length(), s, valueString.length()); int n = sscanf(s, "%ud", &value); U_ASSERT(n == 1); U_ASSERT(value >= 0); mtd->addWord(candidate, length, status, (uint16_t)value); delete[] s; } else { mtd->addWord(candidate, length, status); } if (U_FAILURE(status)) { fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", u_errorName(status), count); exit(status); } } // Find beginning of next line while (uc && !breaks.contains(uc)) { uc = *current++; } // Find next non-line-breaking character while (uc && breaks.contains(uc)) { uc = *current++; } candidate = current-1; length = 0; } // Get rid of the Unicode text buffer delete[] wordSourceU; } // Now, create a CompactTrieDictionary from the mutable dictionary CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); if (U_FAILURE(status)) { fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Get rid of the MutableTrieDictionary delete mtd; // // Get the binary data from the dictionary. 
// uint32_t outDataSize = ctd->dataSize(); const uint8_t *outData = (const uint8_t *)ctd->data(); // // Create the output file // size_t bytesWritten; UNewDataMemory *pData; pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", outFileName, u_errorName(status)); exit(status); } // Write the data itself. udata_writeBlock(pData, outData, outDataSize); // finish up bytesWritten = udata_finish(pData, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status)); exit(status); } if (bytesWritten != outDataSize) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(-1); } // Get rid of the CompactTrieDictionary delete ctd; u_cleanup(); printf("genctd: tool completed successfully.\n"); return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }
//---------------------------------------------------------------------
//
//   dumpOp    Output one opcode of the compiled form of the pattern,
//             in human-readable form. Debugging function only; compiles
//             to an empty body unless REGEX_DEBUG is defined.
//
//   @param index  index of the opcode within fCompiledPat.
//
//---------------------------------------------------------------------
void RegexPattern::dumpOp(int32_t index) const {
    (void)index;  // Suppress warnings in non-debug build.
#if defined(REGEX_DEBUG)
    static const char * const opNames[] = {URX_OPCODE_NAMES};
    int32_t op   = fCompiledPat->elementAti(index);
    int32_t val  = URX_VAL(op);
    int32_t type = URX_TYPE(op);
    int32_t pinnedType = type;
    // Clamp unknown opcode types to 0 so the opNames[] lookup below stays in bounds.
    if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }
    printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
    case URX_BACKSLASH_H:
    case URX_BACKSLASH_R:
    case URX_BACKSLASH_V:
        // types with an integer operand field.
        printf("%d", val);
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        // Operand is a single literal character; print '?' for non-ASCII values.
        printf("%c", val<256?val:'?');
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            // Operand is an index into fLiteralText; the following opcode holds the length.
            int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t i;
            for (i=val; i<val+length; i++) {
                UChar c = fLiteralText[i];
                // Replace non-printable / non-ASCII characters with '.' for display.
                if (c < 32 || c >= 256) {c = '.';}
                printf("%c", c);
            }
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            // Operand is an index into the pattern's set of UnicodeSets.
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                printf("%c", s.charAt(i));
            }
        }
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            // Operand selects one of the shared static sets; high bit flags negation.
            UnicodeString s;
            if (val & URX_NEG_SET) {
                printf("NOT ");
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                printf("%c", s.charAt(i));
            }
        }
        break;

    default:
        printf("??????");
        break;
    }
    printf("\n");
#endif
}
//
//  APITest.   Invoke every function at least once, and check that it does something.
//             Does not attempt to check complete functionality.
//
void AlphabeticIndexTest::APITest() {
    //
    // Simple constructor and destructor, getBucketCount()
    //
    UErrorCode status = U_ZERO_ERROR;
    int32_t lc = 0;
    int32_t i = 0;
    AlphabeticIndex *index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    lc = index->getBucketCount(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(28 == lc);    // 26 letters plus two under/overflow labels.
    //printf("getBucketCount() == %d\n", lc);
    delete index;

    // Constructor from a Collator
    //
    status = U_ZERO_ERROR;
    RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(
        Collator::createInstance(Locale::getGerman(), status));
    TEST_CHECK_STATUS;
    TEST_ASSERT(coll != NULL);
    // NOTE(review): 'coll' is handed to the index here and not deleted below —
    // presumably this constructor adopts the collator; confirm against the
    // AlphabeticIndex API before changing ownership handling.
    index = new AlphabeticIndex(coll, status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(coll == &index->getCollator());
    assertEquals("only the underflow label in an index built from a collator",
                 1, index->getBucketCount(status));
    TEST_CHECK_STATUS;
    delete index;

    // addLabels() -- explicitly adding label characters grows the bucket list.
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    UnicodeSet additions;
    additions.add((UChar32)0x410).add((UChar32)0x415);   // A couple of Cyrillic letters
    index->addLabels(additions, status);
    TEST_CHECK_STATUS;
    lc = index->getBucketCount(status);
    TEST_CHECK_STATUS;
    assertEquals("underflow, A-Z, inflow, 2 Cyrillic, overflow",
                 31, index->getBucketCount(status));
    // std::cout << lc << std::endl;
    delete index;

    // addLabels(Locale) -- returns a reference to the index for call chaining.
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    AlphabeticIndex &aip = index->addLabels(Locale::getJapanese(), status);
    TEST_ASSERT(&aip == index);
    TEST_CHECK_STATUS;
    lc = index->getBucketCount(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(35 < lc);  // Japanese should add a bunch. Don't rely on the exact value.
    delete index;

    // GetCollator(), Get under/in/over flow labels
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getGerman(), status);
    TEST_CHECK_STATUS;
    Collator *germanCol = Collator::createInstance(Locale::getGerman(), status);
    TEST_CHECK_STATUS;
    const RuleBasedCollator &indexCol = index->getCollator();
    TEST_ASSERT(*germanCol == indexCol);
    delete germanCol;

    // Default under/in/over flow labels are all the ellipsis character U+2026.
    UnicodeString ELLIPSIS;
    ELLIPSIS.append((UChar32)0x2026);
    UnicodeString s = index->getUnderflowLabel();
    TEST_ASSERT(ELLIPSIS == s);
    s = index->getOverflowLabel();
    TEST_ASSERT(ELLIPSIS == s);
    s = index->getInflowLabel();
    TEST_ASSERT(ELLIPSIS == s);
    // Setters; setUnderflowLabel() returns the index, allowing chaining.
    index->setOverflowLabel(UNICODE_STRING_SIMPLE("O"), status);
    index->setUnderflowLabel(UNICODE_STRING_SIMPLE("U"), status).setInflowLabel(UNICODE_STRING_SIMPLE("I"), status);
    s = index->getUnderflowLabel();
    TEST_ASSERT(UNICODE_STRING_SIMPLE("U") == s);
    s = index->getOverflowLabel();
    TEST_ASSERT(UNICODE_STRING_SIMPLE("O") == s);
    s = index->getInflowLabel();
    TEST_ASSERT(UNICODE_STRING_SIMPLE("I") == s);
    delete index;

    // Shared record-name fixtures for the remaining sections.
    const UnicodeString adam = UNICODE_STRING_SIMPLE("Adam");
    const UnicodeString baker = UNICODE_STRING_SIMPLE("Baker");
    const UnicodeString charlie = UNICODE_STRING_SIMPLE("Charlie");
    const UnicodeString chad = UNICODE_STRING_SIMPLE("Chad");
    const UnicodeString zed = UNICODE_STRING_SIMPLE("Zed");
    const UnicodeString Cyrillic = UNICODE_STRING_SIMPLE("\\u0410\\u0443\\u0435").unescape();

    // addRecord(), verify that it comes back out.
    //
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    // 'this' serves as an arbitrary, recognizable record-data pointer.
    index->addRecord(UnicodeString("Adam"), this, status);
    UBool b;
    TEST_CHECK_STATUS;
    index->resetBucketIterator(status);
    TEST_CHECK_STATUS;
    index->nextBucket(status);  // Move to underflow label
    index->nextBucket(status);  // Move to "A"
    TEST_CHECK_STATUS;
    const UnicodeString &label2 = index->getBucketLabel();
    UnicodeString A_STR = UNICODE_STRING_SIMPLE("A");
    TEST_ASSERT(A_STR == label2);
    b = index->nextRecord(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(b);
    const UnicodeString &itemName = index->getRecordName();
    TEST_ASSERT(adam == itemName);
    const void *itemContext = index->getRecordData();
    TEST_ASSERT(itemContext == this);
    delete index;

    // clearRecords, addRecord(), Iteration
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    // A fresh index iterates its buckets but yields no records.
    while (index->nextBucket(status)) {
        TEST_CHECK_STATUS;
        while (index->nextRecord(status)) {
            TEST_CHECK_STATUS;
            TEST_ASSERT(FALSE);   // No items have been added.
        }
        TEST_CHECK_STATUS;
    }
    index->addRecord(adam, NULL, status);
    index->addRecord(baker, NULL, status);
    index->addRecord(charlie, NULL, status);
    index->addRecord(chad, NULL, status);
    TEST_CHECK_STATUS;
    // All four records must come back out, across whatever buckets they land in.
    int itemCount = 0;
    index->resetBucketIterator(status);
    while (index->nextBucket(status)) {
        TEST_CHECK_STATUS;
        while (index->nextRecord(status)) {
            TEST_CHECK_STATUS;
            ++itemCount;
        }
    }
    TEST_CHECK_STATUS;
    TEST_ASSERT(itemCount == 4);

    // Exhausted iterator stays exhausted until reset.
    TEST_ASSERT(index->nextBucket(status) == FALSE);
    index->resetBucketIterator(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(index->nextBucket(status) == TRUE);

    index->clearRecords(status);
    TEST_CHECK_STATUS;
    index->resetBucketIterator(status);
    while (index->nextBucket(status)) {
        TEST_CHECK_STATUS;
        while (index->nextRecord(status)) {
            TEST_ASSERT(FALSE);   // No items have been added.
        }
    }
    TEST_CHECK_STATUS;
    delete index;

    // getBucketLabel(), getBucketType()
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    index->setUnderflowLabel(adam, status).setOverflowLabel(charlie, status);
    TEST_CHECK_STATUS;
    for (i=0; index->nextBucket(status); i++) {
        TEST_CHECK_STATUS;
        UnicodeString label = index->getBucketLabel();
        UAlphabeticIndexLabelType type = index->getBucketLabelType();
        if (i == 0) {
            TEST_ASSERT(type == U_ALPHAINDEX_UNDERFLOW);
            TEST_ASSERT(label == adam);
        } else if (i <= 26) {
            // Labels A - Z for English locale
            TEST_ASSERT(type == U_ALPHAINDEX_NORMAL);
            UnicodeString expectedLabel((UChar)(0x40 + i));
            TEST_ASSERT(expectedLabel == label);
        } else if (i == 27) {
            TEST_ASSERT(type == U_ALPHAINDEX_OVERFLOW);
            TEST_ASSERT(label == charlie);
        } else {
            TEST_ASSERT(FALSE);
        }
    }
    TEST_ASSERT(i==28);
    delete index;

    // getBucketIndex()
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    int32_t n = index->getBucketIndex(adam, status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(n == 1);    /* Label #0 is underflow, 1 is A, etc. */
    n = index->getBucketIndex(baker, status);
    TEST_ASSERT(n == 2);
    n = index->getBucketIndex(Cyrillic, status);
    TEST_ASSERT(n == 27);   // Overflow label
    n = index->getBucketIndex(zed, status);
    TEST_ASSERT(n == 26);

    // While iterating, the no-argument getBucketIndex() tracks the iterator.
    for (i=0; index->nextBucket(status); i++) {
        n = index->getBucketIndex();
        TEST_ASSERT(n == i);
        UnicodeString label = index->getBucketLabel();
        TEST_ASSERT(n == i);
    }
    TEST_ASSERT(i == 28);
    delete index;

    index = new AlphabeticIndex(Locale::createFromName("ru"), status);
    TEST_CHECK_STATUS;
    assertEquals("Russian index.getBucketCount()", 32, index->getBucketCount(status));
    // Latin-script names should go into the underflow label (0)
    // if the Russian collation does not use script reordering,
    // but into the overflow label (getBucketCount()-1)
    // if Russian sorts Cyrillic first.
    int32_t reorderCodes[20];
    int32_t expectedLatinIndex = 0;
    if (index->getCollator().getReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status) > 0) {
        expectedLatinIndex = index->getBucketCount(status) - 1;
    }
    n = index->getBucketIndex(adam, status);
    TEST_CHECK_STATUS;
    assertEquals("Russian index.getBucketIndex(adam)", expectedLatinIndex, n);
    n = index->getBucketIndex(baker, status);
    assertEquals("Russian index.getBucketIndex(baker)", expectedLatinIndex, n);
    n = index->getBucketIndex(Cyrillic, status);
    assertEquals("Russian index.getBucketIndex(Cyrillic)", 1, n);
    n = index->getBucketIndex(zed, status);
    assertEquals("Russian index.getBucketIndex(zed)", expectedLatinIndex, n);
    delete index;
}
void CanonicalIteratorTest::TestBasic() { UErrorCode status = U_ZERO_ERROR; static const char * const testArray[][2] = { {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, }; #if 0 // This is not interesting for C/C++ as the data is already built beforehand // check build UnicodeSet ss = CanonicalIterator.getSafeStart(); logln("Safe Start: " + ss.toPattern(true)); ss = CanonicalIterator.getStarts('a'); expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") ); #endif // check permute // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! 
Hashtable *permutations = new Hashtable(FALSE, status); permutations->setValueDeleter(uhash_deleteUnicodeString); UnicodeString toPermute("ABC"); CanonicalIterator::permute(toPermute, FALSE, permutations, status); logln("testing permutation"); expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); delete permutations; // try samples logln("testing samples"); Hashtable *set = new Hashtable(FALSE, status); set->setValueDeleter(uhash_deleteUnicodeString); int32_t i = 0; CanonicalIterator it("", status); if(U_SUCCESS(status)) { for (i = 0; i < ARRAY_LENGTH(testArray); ++i) { //logln("Results for: " + name.transliterate(testArray[i])); UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); it.setSource(testStr, status); set->removeAll(); for (;;) { //UnicodeString *result = new UnicodeString(it.next()); UnicodeString result(it.next()); if (result.isBogus()) { break; } set->put(result, new UnicodeString(result), status); // Add result to the table //logln(++counter + ": " + hex.transliterate(result)); //logln(" = " + name.transliterate(result)); } expectEqual(i + ": ", testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); } } else { errln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); } delete set; }
/* * Find missing case mapping relationships and add mappings for case closure. * This function starts from an "original" code point and recursively * finds its case mappings and the case mappings of where it maps to. * * The recursion depth is capped at 3 nested calls of this function. * In each call, the current code point is c, and the function enumerates * all of c's simple (single-code point) case mappings. * prev is the code point that case-mapped to c. * prev2 is the code point that case-mapped to prev. * * The initial function call has prev2<0, prev<0, and c==orig * (marking no code points). * It enumerates c's case mappings and recurses without further action. * * The second-level function call has prev2<0, prev==orig, and c is * the destination code point of one of prev's case mappings. * The function checks if any of c's case mappings go back to orig * and adds a closure mapping if not. * In other words, it turns a case mapping relationship of * orig->c * into * orig<->c * * The third-level function call has prev2==orig, prev>=0, and c is * the destination code point of one of prev's case mappings. * (And prev is the destination of one of prev2's case mappings.) * The function checks if any of c's case mappings go back to orig * and adds a closure mapping if not. * In other words, it turns case mapping relationships of * orig->prev->c or orig->prev<->c * into * orig->prev->c->orig or orig->prev<->c->orig * etc. * (Graphically, this closes a triangle.) * * With repeated application on all code points until no more closure mappings * are added, all case equivalence groups get complete mappings. * That is, in each group of code points with case relationships * each code point will in the end have some mapping to each other * code point in the group. 
* * @return TRUE if a closure mapping was added */ UBool CasePropsBuilder::addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return FALSE; } UChar32 next; UBool someMappingsAdded=FALSE; if(c!=orig) { /* get the properties for c */ value=utrie2_get32(pTrie, c); } /* else if c==orig then c's value was passed in */ if(value&UCASE_EXCEPTION) { UnicodeSet set; ExcProps &ep=*excProps[value>>UGENCASE_EXC_SHIFT]; UniProps &p=ep.props; /* * marker for whether any of c's mappings goes to orig * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings */ UBool mapsToOrig=(UBool)(c==orig); /* collect c's case mapping destinations in set[] */ if((next=p.suc)>=0 && next!=c) { set.add(next); } if((next=p.slc)>=0 && next!=c) { set.add(next); } if(p.suc!=(next=p.stc) && next!=c) { set.add(next); } if((next=p.scf)>=0 && next!=c) { set.add(next); } /* add c's current closure mappings to set */ set.addAll(ep.closure); /* process all code points to which c case-maps */ UnicodeSetIterator iter(set); while(iter.next()) { next=iter.getCodepoint(); /* next!=c */ if(next==orig) { mapsToOrig=TRUE; /* remember that we map to orig */ } else if(prev2<0 && next!=prev) { /* * recurse unless * we have reached maximum depth (prev2>=0) or * this is a mapping to one of the previous code points (orig, prev, c) */ someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode); } } if(!mapsToOrig) { addClosureMapping(c, orig, errorCode); return TRUE; } } else {
/*
 * Record the case properties for one code point (or, for property-less
 * ranges, a range) into the case-properties trie.
 *
 * Encodes the case type, simple-mapping delta or exception index, soft-dotted
 * flag, combining-class info, and case-ignorable flag into a single trie
 * value. Any mapping that cannot be stored in the compact "delta" form is
 * routed through the exceptions table (UCASE_EXCEPTION).
 *
 * @param props      parsed per-code-point properties from ppucd.txt
 * @param newValues  the set of property values present on this ppucd line
 * @param errorCode  in/out ICU error code; set on overflow or bad data
 */
void
CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }

    UChar32 start=props.start;
    UChar32 end=props.end;

    /* default: map to self */
    int32_t delta=0;

    /* Determine the case type from the binary/int properties. */
    uint32_t type;
    if(props.binProps[UCHAR_LOWERCASE]) {
        type=UCASE_LOWER;
    } else if(props.binProps[UCHAR_UPPERCASE]) {
        type=UCASE_UPPER;
    } else if(props.getIntProp(UCHAR_GENERAL_CATEGORY)==U_TITLECASE_LETTER) {
        type=UCASE_TITLE;
    } else {
        type=UCASE_NONE;
    }
    uint32_t value=type;

    UBool hasMapping=FALSE;
    if(props.suc>=0) {
        /* uppercase mapping as delta if the character is lowercase */
        hasMapping=TRUE;
        if(type==UCASE_LOWER) {
            delta=props.suc-start;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.slc>=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        hasMapping=TRUE;
        if(type>=UCASE_UPPER) {
            delta=props.slc-start;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.stc>=0) {
        /* a simple titlecase mapping exists */
        hasMapping=TRUE;
    }
    if(props.suc!=props.stc) {
        /* titlecase differs from uppercase: cannot use the simple delta form */
        value|=UCASE_EXCEPTION;
    }
    /* Full (string) case mappings and conditional mappings always need an exception. */
    if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() ||
        newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }
    /* Case folding that differs from simple lowercasing also needs an exception. */
    if( (props.scf>=0 && props.scf!=props.slc) ||
        (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
        newValues.contains(PPUCD_TURKIC_CASE_FOLDING)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }

    // Simple case folding falls back to simple lowercasing.
    // If there is no case folding but there is a lowercase mapping,
    // then add a case folding mapping to the code point.
    // For example: Cherokee uppercase syllables since Unicode 8.
    // (Full case folding falls back to simple case folding,
    // not to full lowercasing, so we need not also handle it specially
    // for such cases.)
    UChar32 scf=props.scf;
    if(scf<0 && props.slc>=0) {
        scf=start;
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }

    /* A delta outside the storable range forces the exception form. */
    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        value|=UCASE_EXCEPTION;
    }

    if(props.binProps[UCHAR_SOFT_DOTTED]) {
        value|=UCASE_SOFT_DOTTED;
    }
    int32_t cc=props.getIntProp(UCHAR_CANONICAL_COMBINING_CLASS);
    if(cc!=0) {
        /* Soft_Dotted and a non-zero combining class are mutually exclusive
           because they share bits in the trie value. */
        if(props.binProps[UCHAR_SOFT_DOTTED]) {
            fprintf(stderr, "genprops error: a soft-dotted character has ccc!=0\n");
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if(cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }
    if(props.binProps[UCHAR_CASE_IGNORABLE]) {
        value|=UCASE_IGNORABLE;
    }

    /* Mappings and exceptions only make sense for single code points, not ranges. */
    if((hasMapping || (value&UCASE_EXCEPTION)) && start!=end) {
        fprintf(stderr,
                "genprops error: range %04lX..%04lX has case mappings "
                "or reasons for data structure exceptions\n",
                (long)start, (long)end);
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* simply store exceptions for later processing and encoding */
        if(excPropsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return;
        }
        ExcProps *newExcProps=new ExcProps(props);
        if(newExcProps==NULL) {
            fprintf(stderr,
                    "genprops error: casepropsbuilder out of memory allocating "
                    "exceptions properties\n");
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        /* store the (possibly synthesized) simple case folding with the exception */
        newExcProps->props.scf=scf;
        newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS);
        newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING);
        /* the exception-table index is stored in the upper bits of the trie value */
        value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
        excProps[excPropsCount++]=newExcProps;
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to set case mapping values: %s\n",
                u_errorName(errorCode));
        return;
    }

    if(hasMapping) {
        /* update the case-sensitive set */
        caseSensitive.add(start);
        if(scf>=0) { caseSensitive.add(scf); }
        if(props.slc>=0) { caseSensitive.add(props.slc); }
        if(props.suc>=0) { caseSensitive.add(props.suc); }
        if(props.stc>=0) { caseSensitive.add(props.stc); }
        caseSensitive.addAll(props.cf);
        caseSensitive.addAll(props.lc);
        caseSensitive.addAll(props.uc);
        caseSensitive.addAll(props.tc);

        /* update maxFullLength */
        if(props.cf.length()>maxFullLength) { maxFullLength=props.cf.length(); }
        if(props.lc.length()>maxFullLength) { maxFullLength=props.lc.length(); }
        if(props.uc.length()>maxFullLength) { maxFullLength=props.uc.length(); }
        if(props.tc.length()>maxFullLength) { maxFullLength=props.tc.length(); }
    }

    /* add the multi-character case folding to the "unfold" data */
    if(props.cf.hasMoreChar32Than(0, 0x7fffffff, 1)) {
        addUnfolding(start, props.cf, errorCode);
    }
}
/*
 * uspoof_check  --  Run the checks enabled in the USpoofChecker against one
 *                   string, returning a bit mask of the checks that failed
 *                   (0 if the string passes all enabled checks).
 *
 * @param sc        the spoof checker; validated before use.
 * @param text      the UTF-16 input string.
 * @param length    length of text, or -1 if NUL-terminated.
 * @param position  out: index of the first failure, if any check failed.
 * @param status    in/out ICU error code.
 */
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
             const UChar *text, int32_t length,
             int32_t *position,
             UErrorCode *status) {

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    if (length < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if (length == -1) {
        // It's not worth the bother to handle nul terminated strings everywhere.
        // Just get the length and be done with it.
        length = u_strlen(text);
    }

    int32_t result = 0;
    int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?

    // A count of the number of non-Common or inherited scripts.
    // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCRIPT_CONFUSABLE tests.
    // Share the computation when possible. scriptCount == -1 means that we haven't
    // done it yet.
    int32_t scriptCount = -1;

    if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
        scriptCount = This->scriptScan(text, length, failPos, *status);
        // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
        if ( scriptCount >= 2) {
            // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
            result |= USPOOF_SINGLE_SCRIPT;
        }
    }

    if (This->fChecks & USPOOF_CHAR_LIMIT) {
        // Reject the string on the first character outside the allowed set.
        int32_t i;
        UChar32 c;
        for (i=0; i<length ;) {
            U16_NEXT(text, i, length, c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                if (i < failPos) {
                    failPos = i;
                }
                break;
            }
        }
    }

    if (This->fChecks &
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        NFDBuffer normalizedInput(text, length, *status);
        const UChar *nfdText = normalizedInput.getBuffer();
        int32_t nfdLength = normalizedInput.getLength();

        if (This->fChecks & USPOOF_INVISIBLE) {
            // scan for more than one occurrence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t i;
            UChar32 c;
            UChar32 firstNonspacingMark = 0;
            UBool haveMultipleMarks = FALSE;
            UnicodeSet marksSeenSoFar;   // Set of combining marks in a single combining sequence.
            for (i=0; i<nfdLength ;) {
                U16_NEXT(nfdText, i, nfdLength, c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    // A base character ends the current combining sequence; reset state.
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    // First mark of a sequence: defer adding it to the set until a
                    // second mark shows up, to avoid set churn for the common case.
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    failPos = i;
                    // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want
                    //       to give back to our caller is a position in the original input string.
                    if (failPos > length) {
                        failPos = length;
                    }
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }

        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            // confusable with itself in its own script.
            // If the number of such scripts is two or more, and the input consisted of
            // characters all from a single script, we have a whole script confusable.
            // (The two scripts will be the original script and the one that is confusable)
            // If the number of such scripts >= one, and the original input contained characters from
            // more than one script, we have a mixed script confusable. (We can transform
            // some of the characters, and end up with a visually similar string all in
            // one script.)
            if (scriptCount == -1) {
                int32_t t;
                scriptCount = This->scriptScan(text, length, t, *status);
            }

            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);

            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }

            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }
    if (position != NULL && failPos != 0x7fffffff) {
        *position = failPos;
    }
    return result;
}
/* Thin forwarding helper; kept inline so the compiler can elide the call. */
inline static int32_t getStringCount(const UnicodeSet& uset) {
    return uset.getStringCount();
}
/*
 * Add the index-label exemplar characters for the given locale to
 * initialLabels_. Prefers CLDR's explicit index-exemplar set; falls back to
 * synthesizing labels from the locale's standard exemplar characters.
 *
 * @param locale  the locale whose index characters are wanted.
 * @param status  in/out ICU error code.
 */
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
    if (U_FAILURE(status)) { return; }
    // Chinese index characters, which are specific to each of the several Chinese tailorings,
    // take precedence over the single locale data exemplar set per language.
    const char *language = locale.getLanguage();
    if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
            uprv_strcmp(language, "ko") == 0) {
        // TODO: This should be done regardless of the language, but it's expensive.
        // We should add a Collator function (can be @internal)
        // to enumerate just the contractions that start with a given code point or string.
        if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
            return;
        }
    }

    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
    if (U_FAILURE(status)) {
        return;
    }

    // First choice: the locale's explicit index exemplar set.
    UnicodeSet exemplars;
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
    if (U_SUCCESS(status)) {
        initialLabels_->addAll(exemplars);
        return;
    }
    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR

    // The locale data did not include explicit Index characters.
    // Synthesize a set of them from the locale's standard exemplar characters.
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
    if (U_FAILURE(status)) {
        return;
    }

    // question: should we add auxiliary exemplars?
    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
        exemplars.add(0x61, 0x7A);
    }
    if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
        // cut down to small list
        exemplars.remove(0xAC00, 0xD7A3).
            add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
            add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
            add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
            add(0xD30C).add(0xD558);
    }
    if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
        // cut down to small list
        // make use of the fact that Ethiopic is allocated in 8's, where
        // the base is 0 mod 8.
        UnicodeSet ethiopic(
            UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
        UnicodeSetIterator it(ethiopic);
        while (it.next() && !it.isString()) {
            // Keep only the base (0 mod 8) syllables as labels.
            if ((it.getCodepoint() & 0x7) != 0) {
                exemplars.remove(it.getCodepoint());
            }
        }
    }

    // Upper-case any that aren't already so.
    // (We only do this for synthesized index characters.)
    UnicodeSetIterator it(exemplars);
    UnicodeString upperC;
    while (it.next()) {
        const UnicodeString &exemplarC = it.getString();
        upperC = exemplarC;
        upperC.toUpper(locale);
        initialLabels_->add(upperC);
    }
}
//---------------------------------------------------------------------------------
//
//  Parse RBBI rules.   The state machine for rules parsing is here.
//                      The state tables are hand-written in the file rbbirpt.txt,
//                      and converted to the form used here by a perl
//                      script rbbicst.pl
//
//  Errors are reported through *fRB->fStatus; parsing stops on the first error.
//
//---------------------------------------------------------------------------------
void RBBIRuleScanner::parse() {
    uint16_t state;
    const RBBIRuleTableEl *tableEl;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    state = 1;
    nextChar(fC);
    //
    // Main loop for the rule parsing state machine.
    // Runs once per state transition.
    // Each time through optionally performs, depending on the state table,
    // - an advance to the the next input char
    // - an action to be performed.
    // - pushing or popping a state to/from the local state return stack.
    //
    for (;;) {
        // Bail out if anything has gone wrong.
        // RBBI rule file parsing stops on the first error encountered.
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }

        // Quit if state == 0. This is the normal way to exit the state machine.
        //
        if (state == 0) {
            break;
        }

        // Find the state table element that matches the input char from the rule, or the
        // class of the input character. Start with the first table row for this
        // state, then linearly scan forward until we find a row that matches the
        // character. The last row for each state always matches all characters, so
        // the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
#ifdef RBBI_DEBUG
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
            RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
                fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
        }
#endif

        for (;;) {
#ifdef RBBI_DEBUG
            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); }
#endif
            if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                // the input character is not escaped, and
                // the input character matched it.
                break;
            }
            if (tableEl->fCharClass == 255) {
                // Table row specified default, match anything character class.
                break;
            }
            if (tableEl->fCharClass == 254 && fC.fEscaped) {
                // Table row specified "escaped" and the char was escaped.
                break;
            }
            if (tableEl->fCharClass == 253 && fC.fEscaped &&
                (fC.fChar == 0x50 || fC.fChar == 0x70 )) {
                // Table row specified "escaped P" and the char is either 'p' or 'P'.
                break;
            }
            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
                // Table row specified eof and we hit eof on the input.
                break;
            }

            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
                fC.fEscaped == FALSE &&                                      //   char is not escaped &&
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                UnicodeSet *uniset = fRuleSets[tableEl->fCharClass-128];
                if (uniset->contains(fC.fChar)) {
                    // Table row specified a character class, or set of characters,
                    // and the current char matches it.
                    break;
                }
            }

            // No match on this row, advance to the next row for this state,
            tableEl++;
        }
        // NOTE(review): this debug print appears without an #ifdef RBBI_DEBUG guard,
        // unlike the two above — confirm RBBIDebugPuts is available (or a no-op)
        // in non-debug builds.
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts(""); }

        //
        // We've found the row of the state table that matches the current input
        // character from the rules string.
        // Perform any action specified by this row in the state table.
        if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
            // Break out of the state machine loop if the
            // the action signalled some kind of error, or
            // the action was to exit, occurs on normal end-of-rules-input.
            break;
        }

        if (tableEl->fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
        }

        if (tableEl->fNextChar) {
            nextChar(fC);
        }

        // Get the next state from the table entry, or from the
        // state stack if the next state was specified as "pop".
        if (tableEl->fNextState != 255) {
            state = tableEl->fNextState;
        } else {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
                fStackPtr++;
            }
        }
    }

    //
    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
    //
    if (fRB->fReverseTree == NULL) {
        fRB->fReverseTree = pushNewNode(RBBINode::opStar);
        RBBINode *operand = pushNewNode(RBBINode::setRef);
        findSetFor(kAny, operand);
        fRB->fReverseTree->fLeftChild = operand;
        operand->fParent = fRB->fReverseTree;
        fNodeStackPtr -= 2;
    }

    //
    // Parsing of the input RBBI rules is complete.
    // We now have a parse tree for the rule expressions
    // and a list of all UnicodeSets that are referenced.
    //
#ifdef RBBI_DEBUG
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) { fSymbolTable->rbbiSymtablePrint(); }
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
        fRB->fForwardTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
        fRB->fReverseTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
        fRB->fSafeFwdTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
        fRB->fSafeRevTree->printTree(TRUE);
    }
#endif
}
//------------------------------------------------------------------------
//
//   build          Build the list of non-overlapping character ranges
//                  from the Unicode Sets, group the ranges into input-symbol
//                  classes, and build the Trie that maps code points to
//                  their class (group) number.
//
//                  Reads the UnicodeSets collected in fRB->fUSetNodes;
//                  writes fRangeList, fGroupCount, fSawBOF and fTrie.
//                  Errors are reported through *fStatus.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
    RBBINode        *usetNode;
    RangeDescriptor *rlRange;

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}

    //
    //  Initialize the process by creating a single range encompassing all characters,
    //  one that belongs to no sets.  It will be split repeatedly below.
    //
    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
    fRangeList->fStartChar    = 0;
    fRangeList->fEndChar      = 0x10ffff;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    //
    //  Find the set of non-overlapping ranges of characters:  after this loop,
    //  every range in fRangeList is either wholly inside or wholly outside
    //  each input UnicodeSet.
    //
    int ni;
    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }

        UnicodeSet      *inputSet           = usetNode->fInputSet;
        int32_t          inputSetRangeCount = inputSet->getRangeCount();
        int              inputSetRangeIndex = 0;
                         rlRange            = fRangeList;

        for (;;) {
            if (inputSetRangeIndex >= inputSetRangeCount) {
                break;
            }
            UChar32      inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
            UChar32      inputSetRangeEnd   = inputSet->getRangeEnd(inputSetRangeIndex);

            // Skip over ranges from the range list that are completely
            //   below the current range from the input unicode set.
            while (rlRange->fEndChar < inputSetRangeBegin) {
                rlRange = rlRange->fNext;
            }

            // If the start of the range from the range list is before the
            //   start of the range from the unicode set, split the range list range
            //   in two, with one part being before (wholly outside of) the unicode set
            //   and the other containing the rest.
            //   Then continue the loop; the post-split current range will then be skipped
            //   over by the while loop above.
            if (rlRange->fStartChar < inputSetRangeBegin) {
                rlRange->split(inputSetRangeBegin, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
                continue;
            }

            // Same thing at the end of the ranges...
            // If the end of the range from the range list doesn't coincide with
            //   the end of the range from the unicode set, split the range list
            //   range in two.  The first part of the split range will be
            //   wholly inside the Unicode set.
            if (rlRange->fEndChar > inputSetRangeEnd) {
                rlRange->split(inputSetRangeEnd+1, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // The current rlRange is now entirely within the UnicodeSet range.
            // Add this unicode set to the list of sets that this rlRange belongs to,
            //   unless it is already there.
            if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                rlRange->fIncludesSets->addElement(usetNode, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // Advance over ranges that we are finished with: move to the next
            //   input-set range only once the range list has caught up with it.
            if (inputSetRangeEnd == rlRange->fEndChar) {
                inputSetRangeIndex++;
            }
            rlRange = rlRange->fNext;
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}

    //
    //  Group the above ranges, with each group consisting of one or more
    //    ranges that are in exactly the same set of original UnicodeSets.
    //    The groups are numbered, and these group numbers are the set of
    //    input symbols recognized by the run-time state machine.
    //
    //    Numbering: # 0  (state table column 0) is unused.
    //               # 1  is reserved - table column 1 is for end-of-input
    //               # 2  is reserved - table column 2 is for beginning-of-input
    //               # 3  is the first free group number, hence the +2 offsets below.
    //
    RangeDescriptor *rlSearchRange;
    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        // Look for an earlier range with an identical set membership; reuse its number.
        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
            if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
                rlRange->fNum = rlSearchRange->fNum;
                break;
            }
        }
        if (rlRange->fNum == 0) {
            // First range with this set membership: allocate a new group number.
            fGroupCount ++;
            rlRange->fNum = fGroupCount+2;
            rlRange->setDictionaryFlag();
            addValToSets(rlRange->fIncludesSets, fGroupCount+2);
        }
    }

    // Handle input sets that contain the special strings {eof} and {bof}.
    //   Column 1 of the state table is reserved for EOF on input.
    //   Column 2 is reserved for before-the-start-of-input.
    //   (This column can be optimized away later if there are no rule
    //    references to {bof}.)
    //   Add this column value (1 or 2) to the equivalent expression
    //   subtree for each UnicodeSet that contains the string {eof} or {bof}.
    //   Because {bof} and {eof} are not characters in the normal sense,
    //   they don't affect the computation of ranges or the Trie.
    static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};   // "eof"
    static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};   // "bof"

    UnicodeString eofString(eofUString);
    UnicodeString bofString(bofUString);
    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }
        UnicodeSet *inputSet = usetNode->fInputSet;
        if (inputSet->contains(eofString)) {
            addValToSet(usetNode, 1);
        }
        if (inputSet->contains(bofString)) {
            addValToSet(usetNode, 2);
            fSawBOF = TRUE;
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}

    //
    // Build the Trie table for mapping UChar32 values to the corresponding
    //   range group number.
    //
    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
                       NULL,    //  Data array  (utrie will allocate one)
                       100000,  //  Max Data Length
                       0,       //  Initial value for all code points
                       0,       //  Lead surrogate unit value
                       TRUE);   //  Keep Latin 1 in separately

    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        // utrie_setRange32 takes an exclusive end limit, hence fEndChar+1.
        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
    }
}
// Use LocalXyzPointer types that are not covered elsewhere in the intltest suite. void LocalPointerTest::TestLocalXyzPointer() { IcuTestErrorCode errorCode(*this, "TestLocalXyzPointer"); static const char *const encoding="ISO-8859-1"; LocalUConverterSelectorPointer sel( ucnvsel_open(&encoding, 1, NULL, UCNV_ROUNDTRIP_SET, errorCode)); if(errorCode.logIfFailureAndReset("ucnvsel_open()")) { return; } if(sel.isNull()) { errln("LocalUConverterSelectorPointer failure"); return; } #if !UCONFIG_NO_FORMATTING LocalUCalendarPointer cal(ucal_open(NULL, 0, "root", UCAL_GREGORIAN, errorCode)); if(errorCode.logDataIfFailureAndReset("ucal_open()")) { return; } if(cal.isNull()) { errln("LocalUCalendarPointer failure"); return; } LocalUDateTimePatternGeneratorPointer patgen(udatpg_open("root", errorCode)); if(errorCode.logDataIfFailureAndReset("udatpg_open()")) { return; } if(patgen.isNull()) { errln("LocalUDateTimePatternGeneratorPointer failure"); return; } LocalULocaleDisplayNamesPointer ldn(uldn_open("de-CH", ULDN_STANDARD_NAMES, errorCode)); if(errorCode.logIfFailureAndReset("uldn_open()")) { return; } if(ldn.isNull()) { errln("LocalULocaleDisplayNamesPointer failure"); return; } UnicodeString hello=UNICODE_STRING_SIMPLE("Hello {0}!"); LocalUMessageFormatPointer msg( umsg_open(hello.getBuffer(), hello.length(), "root", NULL, errorCode)); if(errorCode.logIfFailureAndReset("umsg_open()")) { return; } if(msg.isNull()) { errln("LocalUMessageFormatPointer failure"); return; } #endif /* UCONFIG_NO_FORMATTING */ #if !UCONFIG_NO_NORMALIZATION const UNormalizer2 *nfc=unorm2_getNFCInstance(errorCode); UnicodeSet emptySet; LocalUNormalizer2Pointer fn2(unorm2_openFiltered(nfc, emptySet.toUSet(), errorCode)); if(errorCode.logIfFailureAndReset("unorm2_openFiltered()")) { return; } if(fn2.isNull()) { errln("LocalUNormalizer2Pointer failure"); return; } #endif /* !UCONFIG_NO_NORMALIZATION */ #if !UCONFIG_NO_IDNA LocalUIDNAPointer idna(uidna_openUTS46(0, errorCode)); 
if(errorCode.logIfFailureAndReset("uidna_openUTS46()")) { return; } if(idna.isNull()) { errln("LocalUIDNAPointer failure"); return; } #endif /* !UCONFIG_NO_IDNA */ #if !UCONFIG_NO_REGULAR_EXPRESSIONS UnicodeString pattern=UNICODE_STRING_SIMPLE("abc|xy+z"); LocalURegularExpressionPointer regex( uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, errorCode)); if(errorCode.logIfFailureAndReset("uregex_open()")) { return; } if(regex.isNull()) { errln("LocalURegularExpressionPointer failure"); return; } #endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ #if !UCONFIG_NO_TRANSLITERATION UnicodeString id=UNICODE_STRING_SIMPLE("Grek-Latn"); LocalUTransliteratorPointer trans( utrans_openU(id.getBuffer(), id.length(), UTRANS_FORWARD, NULL, 0, NULL, errorCode)); if(errorCode.logIfFailureAndReset("utrans_open()")) { return; } if(trans.isNull()) { errln("LocalUTransliteratorPointer failure"); return; } #endif /* !UCONFIG_NO_TRANSLITERATION */ // destructors }
//------------------------------------------------------------------------ // // build Build the list of non-overlapping character ranges // from the Unicode Sets. // //------------------------------------------------------------------------ void RBBISetBuilder::build() { RBBINode *usetNode; RangeDescriptor *rlRange; if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) { printSets(); } // // Initialize the process by creating a single range encompassing all characters // that is in no sets. // fRangeList = new RangeDescriptor(*fStatus); // will check for status here fRangeList->fStartChar = 0; fRangeList->fEndChar = 0x10ffff; if (U_FAILURE(*fStatus)) { return; } // // Find the set of non-overlapping ranges of characters // int ni; for (ni=0; ; ni++) { usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; int32_t inputSetRangeCount = inputSet->getRangeCount(); int inputSetRangeIndex = 0; rlRange = fRangeList; for (;;) { if (inputSetRangeIndex >= inputSetRangeCount) { break; } UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); // skip over ranges from the range list that are completely // below the current range from the input unicode set. while (rlRange->fEndChar < inputSetRangeBegin) { rlRange = rlRange->fNext; } // If the start of the range from the range list is before with // the start of the range from the unicode set, split the range list range // in two, with one part being before (wholly outside of) the unicode set // and the other containing the rest. // Then continue the loop; the post-split current range will then be skipped // over if (rlRange->fStartChar < inputSetRangeBegin) { rlRange->split(inputSetRangeBegin, *fStatus); if (U_FAILURE(*fStatus)) { return; } continue; } // Same thing at the end of the ranges... 
// If the end of the range from the range list doesn't coincide with // the end of the range from the unicode set, split the range list // range in two. The first part of the split range will be // wholly inside the Unicode set. if (rlRange->fEndChar > inputSetRangeEnd) { rlRange->split(inputSetRangeEnd+1, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // The current rlRange is now entirely within the UnicodeSet range. // Add this unicode set to the list of sets for this rlRange if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { rlRange->fIncludesSets->addElement(usetNode, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // Advance over ranges that we are finished with. if (inputSetRangeEnd == rlRange->fEndChar) { inputSetRangeIndex++; } rlRange = rlRange->fNext; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges(); } // // Group the above ranges, with each group consisting of one or more // ranges that are in exactly the same set of original UnicodeSets. // The groups are numbered, and these group numbers are the set of // input symbols recognized by the run-time state machine. 
// RangeDescriptor *rlSearchRange; for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { rlRange->fNum = rlSearchRange->fNum; break; } } if (rlRange->fNum == 0) { fGroupCount ++; rlRange->fNum = fGroupCount; rlRange->setDictionaryFlag(); addValToSets(rlRange->fIncludesSets, fGroupCount); } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) { printRangeGroups(); } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) { printSets(); } // // Build the Trie table for mapping UChar32 values to the corresponding // range group number // fTrie = utrie_open(NULL, // Pre-existing trie to be filled in NULL, // Data array (utrie will allocate one) 100000, // Max Data Length 0, // Initial value for all code points 0, // Lead surrogate unit value TRUE); // Keep Latin 1 in separately for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); } }