//---------------------------------------------------------------------------------------- // // findSetFor given a UnicodeString, // - find the corresponding Unicode Set (uset node) // (create one if necessary) // - Set fLeftChild of the caller's node (should be a setRef node) // to the uset node // Maintain a hash table of uset nodes, so the same one is always used // for the same string. // If a "to adopt" set is provided and we haven't seen this key before, // add the provided set to the hash table. // If the string is one (32 bit) char in length, the set contains // just one element which is the char in question. // If the string is "any", return a set containing all chars. // //---------------------------------------------------------------------------------------- void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { RBBISetTableEl *el; // First check whether we've already cached a set for this string. // If so, just use the cached set in the new node. // delete any set provided by the caller, since we own it. el = (RBBISetTableEl *)uhash_get(fSetTable, &s); if (el != NULL) { delete setToAdopt; node->fLeftChild = el->val; U_ASSERT(node->fLeftChild->fType == RBBINode::uset); return; } // Haven't seen this set before. // If the caller didn't provide us with a prebuilt set, // create a new UnicodeSet now. if (setToAdopt == NULL) { if (s.compare(kAny, -1) == 0) { setToAdopt = new UnicodeSet(0x000000, 0x10ffff); } else { UChar32 c; c = s.char32At(0); setToAdopt = new UnicodeSet(c, c); } } // // Make a new uset node to refer to this UnicodeSet // This new uset node becomes the child of the caller's setReference node. // RBBINode *usetNode = new RBBINode(RBBINode::uset); usetNode->fInputSet = setToAdopt; usetNode->fParent = node; node->fLeftChild = usetNode; usetNode->fText = s; // // Add the new uset node to the list of all uset nodes. 
// fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); // // Add the new set to the set hash table. // el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); UnicodeString *tkey = new UnicodeString(s); if (tkey == NULL || el == NULL || setToAdopt == NULL) { error(U_MEMORY_ALLOCATION_ERROR); return; } el->key = tkey; el->val = usetNode; uhash_put(fSetTable, el->key, el, fRB->fStatus); return; }
/**
 * Builds a sort name for `name` in `dest`.
 *
 * When langType_ is not kSimplified, or the first code point of `name` is not
 * in UNIHAN, `dest` is simply a copy of `name`.  Otherwise the name is located
 * in the HACK_PINYIN_LOOKUP table using `col`, and `dest` becomes the matching
 * PINYIN_LOWER_BOUNDS character followed by `name`.
 *
 * @param dest  receives the result.
 * @param name  the name to process.
 * @param col   collator used to initialize the pinyin bounds and to compare
 *              `name` against the lookup table entries.
 */
void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {

    if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
        dest = name;
        return;
    }

    UErrorCode status = U_ZERO_ERROR;
    initPinyinBounds(col, status);
    if (U_FAILURE(status)) {
        dest = name;
        return;
    }
    // TODO:  use binary search
    int index;
    for (index=0; ; index++) {
        // An entry starting with 0xffff terminates the table.
        if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
            index--;
            break;
        }
        int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1));
        if (compareResult < 0) {
            // name sorts before this entry, so it belongs to the previous one.
            index--;
        }
        if (compareResult <= 0) {
            break;
        }
    }
    if (index < 0) {
        // Defensive: the table is empty or name sorts before its first entry.
        // There is no lower bound to prepend, and PINYIN_LOWER_BOUNDS[-1]
        // must not be read; fall back to the unmodified name.
        dest = name;
        return;
    }
    UChar c = PINYIN_LOWER_BOUNDS[index];
    dest.setTo(c);
    dest.append(name);
    return;
}
// Feeds every code point of ustr, in order, to the single-code-point
// overload of operator+=.
void AccumulativeWordCounter::operator+=(const UnicodeString& ustr) {
    int32_t offset = 0;
    while (offset < ustr.length()) {
        this->operator+=(ustr.char32At(offset));
        // Step over one whole code point (one or two UTF-16 units).
        offset = ustr.moveIndex32(offset, 1);
    }
}
// Computes the skeleton of `id` into `dest` and returns `dest`.
// The `type` argument is ignored by this implementation.
// Steps: NFD-normalize the input, map each code point through the
// confusable lookup, then NFD-normalize the accumulated result.
// If validateThis() leaves *status in a failure state, dest is returned untouched.
U_I18N_API UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t /*type*/,
                                const UnicodeString &id,
                                UnicodeString &dest,
                                UErrorCode *status) {
    const SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return dest;
    }

    UnicodeString nfdId;
    gNfdNormalizer->normalize(id, nfdId, *status);

    // Apply the skeleton mapping to the NFD normalized input string,
    // accumulating the (possibly unnormalized) skeleton in skelStr.
    UnicodeString skelStr;
    const int32_t normalizedLen = nfdId.length();
    int32_t cursor = 0;
    while (cursor < normalizedLen) {
        UChar32 cp = nfdId.char32At(cursor);
        cursor += U16_LENGTH(cp);
        impl->fSpoofData->confusableLookup(cp, skelStr);
    }

    gNfdNormalizer->normalize(skelStr, dest, *status);
    return dest;
}
// Parses a Unicode identifier from str starting at pos.
// The first code point must satisfy u_isIDStart and the following ones
// u_isIDPart.  On success pos is advanced past the identifier and the
// identifier is returned.  If the first code point is not an ID start, an
// empty string is returned and pos is left unchanged.
UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
    // assert(pos < str.length());
    // assert(!uprv_isRuleWhiteSpace(str.char32At(pos)));
    UnicodeString ident;
    int cursor = pos;
    for (; cursor < str.length(); ) {
        UChar32 cp = str.char32At(cursor);
        const UBool atStart = (ident.length() == 0);
        const UBool accepted = atStart ? u_isIDStart(cp) : u_isIDPart(cp);
        if (!accepted) {
            if (atStart) {
                // Not an identifier at all: report failure without moving pos.
                ident.truncate(0);
                return ident;
            }
            break;
        }
        ident.append(cp);
        cursor += UTF_CHAR_LENGTH(cp);
    }
    pos = cursor;
    return ident;
}
int32_t ICU_Utility::parseNumber(const UnicodeString& text, int32_t& pos, int8_t radix) { // assert(pos[0] >= 0); // assert(radix >= 2); // assert(radix <= 36); int32_t n = 0; int32_t p = pos; while (p < text.length()) { UChar32 ch = text.char32At(p); int32_t d = u_digit(ch, radix); if (d < 0) { break; } n = radix*n + d; // ASSUME that when a 32-bit integer overflows it becomes // negative. E.g., 214748364 * 10 + 8 => negative value. if (n < 0) { return -1; } ++p; } if (p == pos) { return -1; } pos = p; return n; }
//--------------------------------------------------------------------------------------- // // wholeScriptCheck() // // Input text is already normalized to NFD // Return the set of scripts, each of which can represent something that is // confusable with the input text. The script of the input text // is included; input consisting of characters from a single script will // always produce a result consisting of a set containing that script. // //--------------------------------------------------------------------------------------- void SpoofImpl::wholeScriptCheck( const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { UTrie2 *table = (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; result->setAll(); int32_t length = text.length(); for (int32_t inputIdx=0; inputIdx < length;) { UChar32 c = text.char32At(inputIdx); inputIdx += U16_LENGTH(c); uint32_t index = utrie2_get32(table, c); if (index == 0) { // No confusables in another script for this char. // TODO: we should change the data to have sets with just the single script // bit for the script of this char. Gets rid of this special case. // Until then, grab the script from the char and intersect it with the set. UScriptCode cpScript = uscript_getScript(c, &status); U_ASSERT(cpScript > USCRIPT_INHERITED); result->intersect(cpScript, status); } else if (index == 1) { // Script == Common or Inherited. Nothing to do. } else { result->intersect(fSpoofData->fScriptSets[index]); } } }
void TextTrieMap::search(CharacterNode *node, const UnicodeString &text, int32_t start, int32_t index, TextTrieMapSearchResultHandler *handler, UErrorCode &status) const { if (U_FAILURE(status)) { return; } if (node->hasValues()) { if (!handler->handleMatch(index - start, node, status)) { return; } if (U_FAILURE(status)) { return; } } UChar32 c = text.char32At(index); if (fIgnoreCase) { // size of character may grow after fold operation UnicodeString tmp(c); tmp.foldCase(); int32_t tmpidx = 0; while (tmpidx < tmp.length()) { c = tmp.char32At(tmpidx); node = getChildNode(node, c); if (node == NULL) { break; } tmpidx = tmp.moveIndex32(tmpidx, 1); } } else { node = getChildNode(node, c); } if (node != NULL) { search(node, text, start, index+1, handler, status); } }
void StaticUnicodeSetsTest::assertInSet(const UnicodeString &localeName, const UnicodeString &setName, const UnicodeSet &set, const UnicodeString &str) { if (str.countChar32(0, str.length()) != 1) { // Ignore locale strings with more than one code point (usually a bidi mark) return; } assertInSet(localeName, setName, set, str.char32At(0)); }
void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) { UChar32 c = 0; int32_t len = word.length(); for (int32_t i = 0; i < len; i += U16_LENGTH(c)) { c = word.char32At(i); buf.append(transform(c, errorCode), errorCode); } }
/**
 * Extracts the first code point of u into *c.
 *
 * @param u       the source string.
 * @param c       receives the first code point (not written by the fallback
 *                path when u is empty).
 * @param status  passed to UnicodeString::toUTF32() on ICU 4.2 and later;
 *                unused by the fallback path.
 * @return the number of code points in u.
 */
int32_t toUChar32(UnicodeString& u, UChar32 *c, UErrorCode& status)
{
#if U_ICU_VERSION_HEX >= 0x04020000
    return u.toUTF32(c, 1, status);
#else
    if (u.length() >= 1)
        *c = u.char32At(0);
    // Report code points, not UTF-16 code units, so that the return value
    // means the same thing as on the toUTF32() path: a single supplementary
    // character counts as 1 rather than 2.
    // NOTE(review): unlike toUTF32(), this path never touches status.
    return u.countChar32();
#endif
}
// Returns true if cp itself, or the last code point of cp's confusable
// lookup result (when that differs from cp), is reported by
// isIllegalCombiningDotLeadCharacterNoLookup().
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }

    UnicodeString skeleton;
    fSpoofData->confusableLookup(cp, skeleton);

    // Start index of the final code point of the lookup result.
    const int32_t lastStart = skeleton.moveIndex32(skeleton.length(), -1);
    UChar32 lastCp = skeleton.char32At(lastStart);

    return lastCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
}
// Computes the set of numerics for a string, according to UTS 39 section 5.3. void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { result.clear(); UChar32 codePoint; for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { codePoint = input.char32At(i); // Store a representative character for each kind of decimal digit if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) { // Store the zero character as a representative for comparison. // Unicode guarantees it is codePoint - value result.add(codePoint - (UChar32)u_getNumericValue(codePoint)); } } }
void AlphabeticIndexTest::HackPinyinTest() { UErrorCode status = U_ZERO_ERROR; AlphabeticIndex aindex(Locale::createFromName("zh"), status); TEST_CHECK_STATUS; UnicodeString names[sizeof(pinyinTestData) / sizeof(pinyinTestData[0])]; int32_t nameCount; for (nameCount=0; pinyinTestData[nameCount] != NULL; nameCount++) { names[nameCount] = UnicodeString(pinyinTestData[nameCount], -1, UnicodeString::kInvariant).unescape(); aindex.addRecord(names[nameCount], &names[nameCount], status); TEST_CHECK_STATUS; if (U_FAILURE(status)) { return; } } TEST_ASSERT(nameCount == aindex.getRecordCount(status)); // Weak checking: make sure that none of the Chinese names landed in the overflow bucket // of the index, and that the names are distributed among several buckets. // (Exact expected data would be subject to change with evolution of the collation rules.) int32_t bucketCount = 0; int32_t filledBucketCount = 0; while (aindex.nextBucket(status)) { bucketCount++; UnicodeString label = aindex.getBucketLabel(); // std::string s; // std::cout << label.toUTF8String(s) << ": "; UBool bucketHasContents = FALSE; while (aindex.nextRecord(status)) { bucketHasContents = TRUE; UnicodeString name = aindex.getRecordName(); if (aindex.getBucketLabelType() != U_ALPHAINDEX_NORMAL) { errln("File %s, Line %d, Name \"\\u%x\" is in an under or overflow bucket.", __FILE__, __LINE__, name.char32At(0)); } // s.clear(); // std::cout << aindex.getRecordName().toUTF8String(s) << " "; } if (bucketHasContents) { filledBucketCount++; } // std::cout << std::endl; } TEST_ASSERT(bucketCount > 25); TEST_ASSERT(filledBucketCount > 15); }
void UnicodeTest::TestScriptMetadata() { IcuTestErrorCode errorCode(*this, "TestScriptMetadata()"); UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode); // So far, sample characters are uppercase. // Georgian is special. UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode); for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) { UScriptCode sc = (UScriptCode)sci; // Run the test with -v to see which script has failures: // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 3 FAIL logln(uscript_getShortName(sc)); UScriptUsage usage = uscript_getUsage(sc); UnicodeString sample = uscript_getSampleUnicodeString(sc); UnicodeSet scriptSet; scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode); if(usage == USCRIPT_USAGE_NOT_ENCODED) { assertTrue("not encoded, no sample", sample.isEmpty()); assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc)); assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc)); assertFalse("not encoded, not cased", uscript_isCased(sc)); assertTrue("not encoded, no characters", scriptSet.isEmpty()); } else { assertFalse("encoded, has a sample character", sample.isEmpty()); UChar32 firstChar = sample.char32At(0); UScriptCode charScript = getCharScript(sc); assertEquals("script(sample(script))", charScript, uscript_getScript(firstChar, errorCode)); assertEquals("RTL vs. set", rtl.contains(firstChar), uscript_isRightToLeft(sc)); assertEquals("cased vs. 
set", cased.contains(firstChar), uscript_isCased(sc)); assertEquals("encoded, has characters", sc == charScript, !scriptSet.isEmpty()); if(uscript_isRightToLeft(sc)) { rtl.removeAll(scriptSet); } if(uscript_isCased(sc)) { cased.removeAll(scriptSet); } } } UnicodeString pattern; assertEquals("no remaining RTL characters", UnicodeString("[]"), rtl.toPattern(pattern)); assertEquals("no remaining cased characters", UnicodeString("[]"), cased.toPattern(pattern)); assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN)); assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI)); assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN)); }
//------------------------------------------------------------------------------ // // stripRules Return a rules string without extra spaces. // (Comments are removed separately, during rule parsing.) // //------------------------------------------------------------------------------ UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { UnicodeString strippedRules; int32_t rulesLength = rules.length(); bool skippingSpaces = false; for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) { UChar32 cp = rules.char32At(idx); bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE); if (skippingSpaces && whiteSpace) { continue; } strippedRules.append(cp); skippingSpaces = whiteSpace; } return strippedRules; }
// Computes the skeleton of `id` into `dest` and returns `dest`.
// `type` selects the confusable table: 0, USPOOF_SINGLE_SCRIPT_CONFUSABLE,
// USPOOF_ANY_CASE, or the OR of the latter two.  Any other value sets
// *status to U_ILLEGAL_ARGUMENT_ERROR and returns dest untouched.
// Steps: NFD-normalize the input, map each code point through the selected
// table, then NFD-normalize the accumulated result.
U_I18N_API UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t type,
                                const UnicodeString &id,
                                UnicodeString &dest,
                                UErrorCode *status) {
    const SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return dest;
    }

    int32_t tableMask = 0;
    switch (type) {
    case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
    case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
    case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
    case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
    default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return dest;
    }

    UnicodeString nfdId;
    gNfdNormalizer->normalize(id, nfdId, *status);

    // Apply the skeleton mapping to the NFD normalized input string,
    // accumulating the (possibly unnormalized) skeleton in skelStr.
    UnicodeString skelStr;
    const int32_t normalizedLen = nfdId.length();
    int32_t cursor = 0;
    while (cursor < normalizedLen) {
        UChar32 cp = nfdId.char32At(cursor);
        cursor += U16_LENGTH(cp);
        impl->confusableLookup(cp, tableMask, skelStr);
    }

    gNfdNormalizer->normalize(skelStr, dest, *status);
    return dest;
}
/**
 * Convert a string to an unsigned decimal, ignoring rule whitespace.
 * At most 10 digits are accepted, and the value must fit in an int32_t.
 * @return a non-negative number if successful, or a negative number
 *         upon failure.
 */
static int32_t stou(const UnicodeString& string) {
    // Accumulate in 64 bits: ten decimal digits can exceed INT32_MAX, and a
    // 32-bit accumulator would overflow (undefined behaviour) and could yield
    // a bogus non-negative result.
    int64_t n = 0;
    int32_t count = 0;
    UChar32 c;
    for (int32_t i=0; i<string.length(); i+=U16_LENGTH(c)) {
        c = string.char32At(i);
        if (uprv_isRuleWhiteSpace(c)) {
            continue;
        }
        int32_t d = u_digit(c, 10);
        if (d < 0 || ++count > 10) {
            return -1;
        }
        n = 10*n + d;
    }
    if (n > 0x7fffffff) {
        // Does not fit in the return type: report failure.
        return -1;
    }
    return static_cast<int32_t>(n);
}
// Returns a copy of item with CGJ inserted between each pair of adjacent
// code points.  An empty item yields an empty result.
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
    UnicodeString result;
    if (item.length() == 0) {
        return result;
    }

    // First code point goes in unprefixed ...
    int32_t i = 0;
    result.append(item.char32At(i));
    i = item.moveIndex32(i, 1);

    // ... every later one is preceded by the separator.
    while (i < item.length()) {
        result.append(CGJ);
        result.append(item.char32At(i));
        i = item.moveIndex32(i, 1);
    }
    return result;
}
// Scans input for U+0307 following (possibly across other combining marks)
// a character for which isIllegalCombiningDotLeadCharacter() is true.
// Returns the index of that U+0307, or -1 if there is none.
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool sawLeadCharacter = false;
    int32_t i = 0;
    while (i < input.length()) {
        UChar32 cp = input.char32At(i);
        if (sawLeadCharacter && cp == 0x0307) {
            return i;
        }
        uint8_t combiningClass = u_getCombiningClass(cp);
        // Skip over characters except for those with combining class 0 (non-combining characters) or with
        // combining class 230 (same class as U+0307)
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        switch (combiningClass) {
        case 0:
        case 230:
            sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
            break;
        default:
            break;
        }
        i += U16_LENGTH(cp);
    }
    return -1;
}
// Computes the resolved script set for a string, omitting characters having the specified script. // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { result.setAll(); ScriptSet temp; UChar32 codePoint; for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { codePoint = input.char32At(i); // Compute the augmented script set for the character getAugmentedScriptSet(codePoint, temp, status); if (U_FAILURE(status)) { return; } // Intersect the augmented script set with the resolved script set, but only if the character doesn't // have the script specified in the function call if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { result.intersect(temp); } } }
void CountWords(const UnicodeString& ustr, size_t& cnt, size_t& ctrl_cnt, size_t& sp_cnt) { UErrorCode status = U_ZERO_ERROR; boost::scoped_ptr<BreakIterator> bi ( BreakIterator::createWordInstance(Locale::getDefault(), status) ); bi->setText(ustr); int32_t i = bi->first(); while (i < ustr.length()) { ++cnt; UChar32 ch = ustr.char32At(i); if (u_iscntrl(ch)) ++ctrl_cnt; else if(u_isspace(ch)) ++sp_cnt; i = bi->next(); } }
// Base class implementation of toRules: writes the ID into rulesSource in
// rule form, i.e. prefixed with "::" and followed by ID_DELIM
// (foo => ::foo).  When escapeUnprintable is true, each code point of the ID
// is passed through ICU_Utility::escapeUnprintable(), and appended verbatim
// only if that call did not emit an escape for it.
UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
                                       UBool escapeUnprintable) const {
    if (!escapeUnprintable) {
        rulesSource = getID();
    } else {
        rulesSource.truncate(0);
        UnicodeString id = getID();
        int32_t i = 0;
        while (i < id.length()) {
            UChar32 c = id.char32At(i);
            i += UTF_CHAR_LENGTH(c);
            if (!ICU_Utility::escapeUnprintable(rulesSource, c)) {
                rulesSource.append(c);
            }
        }
    }
    // KEEP in sync with rbt_pars
    rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
    rulesSource.append(ID_DELIM);
    return rulesSource;
}
// Binary property helper: reports whether case folding changes c.
// c is first replaced by its decomposition, if it has one.  A decomposition
// that is a single code point is folded via ucase_toFullFolding(); a longer
// one is folded as a string and compared with the unfolded decomposition.
// Returns FALSE if the normalizer cannot be obtained or c is negative.
static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }

    UnicodeString nfd;
    if(!nfcNorm2->getDecomposition(c, nfd)) {
        if(c<0) {
            return FALSE;  /* protect against bad input */
        }
    } else {
        /* c has a decomposition */
        const int32_t decompLength=nfd.length();
        UBool isSingleCodePoint=FALSE;
        if(decompLength==1) {
            c=nfd[0];  /* single BMP code point */
            isSingleCodePoint=TRUE;
        } else if(decompLength<=U16_MAX_LENGTH) {
            UChar32 first=nfd.char32At(0);
            if(decompLength==U16_LENGTH(first)) {
                c=first;  /* single supplementary code point */
                isSingleCodePoint=TRUE;
            }
        }
        if(!isSingleCodePoint) {
            c=U_SENTINEL;  /* marks "use the string path below" */
        }
    }

    if(c<0) {
        /* guess some large but stack-friendly capacity */
        UChar dest[2*UCASE_MAX_STRING_LENGTH];
        int32_t destLength=u_strFoldCase(dest, LENGTHOF(dest),
                                         nfd.getBuffer(), nfd.length(),
                                         U_FOLD_CASE_DEFAULT, &errorCode);
        return (UBool)(U_SUCCESS(errorCode) &&
                       0!=u_strCompare(nfd.getBuffer(), nfd.length(),
                                       dest, destLength, FALSE));
    }

    /* single code point */
    const UCaseProps *csp=ucase_getSingleton();
    const UChar *resultString;
    return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
}
/**
 * Runs the checks enabled in the checker's fChecks mask against id and
 * returns a bit mask with one bit set for every check that failed.
 *
 * @param sc        the spoof checker; validated through SpoofImpl::validateThis().
 * @param id        the identifier to check.
 * @param position  if not NULL, *position is always set to 0.
 * @param status    ICU error code.  A failure while obtaining the identifier
 *                  info skips the remaining checks and returns the result
 *                  accumulated so far.
 * @return the OR of the USPOOF_* flags of the failed checks.  When
 *         USPOOF_AUX_INFO is enabled together with USPOOF_RESTRICTION_LEVEL,
 *         the identifier's restriction level is OR'ed in as well.
 *         Returns 0 if validateThis() returns NULL.
 */
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
                          const icu::UnicodeString &id,
                          int32_t *position,
                          UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    int32_t result = 0;

    // Shared by the restriction-level, mixed-numbers and script-confusable
    // checks; obtained lazily and released at cleanupAndReturn.
    IdentifierInfo *identifierInfo = NULL;
    if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
        identifierInfo = This->getIdentifierInfo(*status);
        if (U_FAILURE(*status)) {
            goto cleanupAndReturn;
        }
        identifierInfo->setIdentifier(id, *status);
        identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
    }

    if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
        URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
        if (idRestrictionLevel > This->fRestrictionLevel) {
            result |= USPOOF_RESTRICTION_LEVEL;
        }
        if (This->fChecks & USPOOF_AUX_INFO) {
            result |= idRestrictionLevel;
        }
    }

    if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
        const UnicodeSet *numerics = identifierInfo->getNumerics();
        if (numerics->size() > 1) {
            result |= USPOOF_MIXED_NUMBERS;
        }

        // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
        //       We have no easy way to do the same in C.
        // if (checkResult != null) {
        //     checkResult.numerics = numerics;
        // }
    }

    if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
        // Fail on the first code point that is not in the allowed set.
        int32_t i;
        UChar32 c;
        int32_t length = id.length();
        for (i=0; i<length ;) {
            c = id.char32At(i);
            i += U16_LENGTH(c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                break;
            }
        }
    }

    if (This->fChecks &
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        UnicodeString nfdText;
        gNfdNormalizer->normalize(id, nfdText, *status);
        int32_t nfdLength = nfdText.length();

        if (This->fChecks & USPOOF_INVISIBLE) {

            // scan for more than one occurrence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t i;
            UChar32 c;
            UChar32 firstNonspacingMark = 0;
            UBool haveMultipleMarks = FALSE;
            UnicodeSet marksSeenSoFar;   // Set of combining marks in a single combining sequence.

            for (i=0; i<nfdLength ;) {
                c = nfdText.char32At(i);
                i += U16_LENGTH(c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    // End of the current combining sequence: reset the tracking state.
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    // First mark of a sequence: remember it, but defer adding
                    // it to the set until a second mark shows up.
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }


        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            // confusable with itself in its own script.
            //
            // If the number of such scripts is two or more, and the input consisted of
            // characters all from a single script, we have a whole script confusable.
            // (The two scripts will be the original script and the one that is confusable)
            //
            // If the number of such scripts >= one, and the original input contained characters from
            // more than one script, we have a mixed script confusable.  (We can transform
            // some of the characters, and end up with a visually similar string all in
            // one script.)

            if (identifierInfo == NULL) {
                identifierInfo = This->getIdentifierInfo(*status);
                if (U_FAILURE(*status)) {
                    goto cleanupAndReturn;
                }
                identifierInfo->setIdentifier(id, *status);
            }

            int32_t scriptCount = identifierInfo->getScriptCount();

            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);

            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }

            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

cleanupAndReturn:
    // Reached on both the normal and the early-exit paths; identifierInfo
    // may still be NULL here.
    This->releaseIdentifierInfo(identifierInfo);
    if (position != NULL) {
        *position = 0;
    }
    return result;
}
/**
 * Formats cal, appending the result to appendTo.
 *
 * The day difference between cal and now is looked up through
 * getStringForDay(); when a relative string is found it is used in place of
 * the date pattern.  Depending on which of fDatePattern / fTimePattern /
 * fCombinedFormat are available, the output is the time only, the date (or
 * relative string) only, or the two combined through fCombinedFormat.
 *
 * The error code is a local variable: failures are not reported to the caller.
 *
 * @param cal       the calendar holding the date/time to format.
 * @param appendTo  the string the result is appended to.
 * @param pos       field position passed through to fDateTimeFormatter.
 * @return appendTo.
 */
UnicodeString& RelativeDateFormat::format(  Calendar& cal,
                                UnicodeString& appendTo,
                                FieldPosition& pos) const {

    UErrorCode status = U_ZERO_ERROR;
    UnicodeString relativeDayString;
    UDisplayContext capitalizationContext = getContext(UDISPCTX_TYPE_CAPITALIZATION, status);

    // calculate the difference, in days, between 'cal' and now.
    int dayDiff = dayDifference(cal, status);

    // look up string
    int32_t len = 0;
    const UChar *theString = getStringForDay(dayDiff, len, status);
    if(U_SUCCESS(status) && (theString!=NULL)) {
        // found a relative string
        relativeDayString.setTo(theString, len);
    }

    // The relative string is capitalized here, and the formatter's own
    // capitalization switched off, only when the date pattern is non-empty
    // and there is no time pattern, no combining format, or
    // fCombinedHasDateAtStart is set.
    if ( relativeDayString.length() > 0 && !fDatePattern.isEmpty() &&
         (fTimePattern.isEmpty() || fCombinedFormat == NULL || fCombinedHasDateAtStart)) {
#if !UCONFIG_NO_BREAK_ITERATION
        // capitalize relativeDayString according to context for relative, set formatter no context
        if ( u_islower(relativeDayString.char32At(0)) && fCapitalizationBrkIter!= NULL &&
             ( capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE ||
               (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU && fCapitalizationOfRelativeUnitsForUIListMenu) ||
               (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_STANDALONE && fCapitalizationOfRelativeUnitsForStandAlone) ) ) {
            // titlecase first word of relativeDayString
            relativeDayString.toTitle(fCapitalizationBrkIter, fLocale, U_TITLECASE_NO_LOWERCASE | U_TITLECASE_NO_BREAK_ADJUSTMENT);
        }
#endif
        fDateTimeFormatter->setContext(UDISPCTX_CAPITALIZATION_NONE, status);
    } else {
        // set our context for the formatter
        fDateTimeFormatter->setContext(capitalizationContext, status);
    }

    if (fDatePattern.isEmpty()) {
        // Time only.
        fDateTimeFormatter->applyPattern(fTimePattern);
        fDateTimeFormatter->format(cal,appendTo,pos);
    } else if (fTimePattern.isEmpty() || fCombinedFormat == NULL) {
        // Date only: the relative string, if any, replaces the formatted date.
        if (relativeDayString.length() > 0) {
            appendTo.append(relativeDayString);
        } else {
            fDateTimeFormatter->applyPattern(fDatePattern);
            fDateTimeFormatter->format(cal,appendTo,pos);
        }
    } else {
        // Date and time: build a combined pattern and format with it.
        UnicodeString datePattern;
        if (relativeDayString.length() > 0) {
            // Need to quote the relativeDayString to make it a legal date pattern
            relativeDayString.findAndReplace(UNICODE_STRING("'", 1), UNICODE_STRING("''", 2)); // double any existing APOSTROPHE
            relativeDayString.insert(0, APOSTROPHE); // add APOSTROPHE at beginning...
            relativeDayString.append(APOSTROPHE); // and at end
            datePattern.setTo(relativeDayString);
        } else {
            datePattern.setTo(fDatePattern);
        }
        UnicodeString combinedPattern;
        fCombinedFormat->format(fTimePattern, datePattern, combinedPattern, status);
        fDateTimeFormatter->applyPattern(combinedPattern);
        fDateTimeFormatter->format(cal,appendTo,pos);
    }

    return appendTo;
}
// testConfData - Check each data item from the Unicode confusables.txt file, // verify that it transforms correctly in a skeleton. // void IntlTestSpoof::testConfData() { UErrorCode status = U_ZERO_ERROR; const char *testDataDir = IntlTest::getSourceTestData(status); TEST_ASSERT_SUCCESS(status); char buffer[2000]; uprv_strcpy(buffer, testDataDir); uprv_strcat(buffer, "confusables.txt"); LocalStdioFilePointer f(fopen(buffer, "rb")); if (f.isNull()) { errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); return; } fseek(f.getAlias(), 0, SEEK_END); int32_t fileSize = ftell(f.getAlias()); LocalArray<char> fileBuf(new char[fileSize]); fseek(f.getAlias(), 0, SEEK_SET); int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); TEST_ASSERT_EQ(amt_read, fileSize); TEST_ASSERT(fileSize>0); if (amt_read != fileSize || fileSize <=0) { return; } UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); LocalUSpoofCheckerPointer sc(uspoof_open(&status)); TEST_ASSERT_SUCCESS(status); // Parse lines from the confusables.txt file. Example Line: // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... // Three fields. The hex fields can contain more than one character, // and each character may be more than 4 digits (for supplemntals) // This regular expression matches lines and splits the fields into capture groups. RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); TEST_ASSERT_SUCCESS(status); while (parseLine.find()) { UnicodeString from = parseHex(parseLine.group(1, status)); if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { // The source character was not NFD. // Skip this case; the first step in obtaining a skeleton is to NFD the input, // so the mapping in this line of confusables.txt will never be applied. 
continue; } UnicodeString rawExpected = parseHex(parseLine.group(2, status)); UnicodeString expected; Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); TEST_ASSERT_SUCCESS(status); int32_t skeletonType = 0; UnicodeString tableType = parseLine.group(3, status); TEST_ASSERT_SUCCESS(status); if (tableType.indexOf("SL") >= 0) { skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; } else if (tableType.indexOf("SA") >= 0) { skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; } else if (tableType.indexOf("ML") >= 0) { skeletonType = 0; } else if (tableType.indexOf("MA") >= 0) { skeletonType = USPOOF_ANY_CASE; } UnicodeString actual; uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(actual == expected); if (actual != expected) { errln(parseLine.group(0, status)); UnicodeString line = "Actual: "; int i = 0; while (i < actual.length()) { appendHexUChar(line, actual.char32At(i)); i = actual.moveIndex32(i, 1); } errln(line); } if (U_FAILURE(status)) { break; } } }
// Returns chars.char32At(offset), i.e. the code point that the member
// `chars` reports for the given offset.
virtual UChar32 getChar32At(int32_t offset) const {
    const UChar32 codePoint = chars.char32At(offset);
    return codePoint;
}
void DecimalFormatPatternParser::applyPatternWithoutExpandAffix( const UnicodeString& pattern, DecimalFormatPattern& out, UParseError& parseError, UErrorCode& status) { if (U_FAILURE(status)) { return; } out = DecimalFormatPattern(); // Clear error struct parseError.offset = -1; parseError.preContext[0] = parseError.postContext[0] = (UChar)0; // TODO: Travis Keep: This won't always work. UChar nineDigit = (UChar)(fZeroDigit + 9); int32_t digitLen = fDigit.length(); int32_t groupSepLen = fGroupingSeparator.length(); int32_t decimalSepLen = fDecimalSeparator.length(); int32_t pos = 0; int32_t patLen = pattern.length(); // Part 0 is the positive pattern. Part 1, if present, is the negative // pattern. for (int32_t part=0; part<2 && pos<patLen; ++part) { // The subpart ranges from 0 to 4: 0=pattern proper, 1=prefix, // 2=suffix, 3=prefix in quote, 4=suffix in quote. Subpart 0 is // between the prefix and suffix, and consists of pattern // characters. In the prefix and suffix, percent, perMill, and // currency symbols are recognized and translated. int32_t subpart = 1, sub0Start = 0, sub0Limit = 0, sub2Limit = 0; // It's important that we don't change any fields of this object // prematurely. We set the following variables for the multiplier, // grouping, etc., and then only change the actual object fields if // everything parses correctly. This also lets us register // the data from part 0 and ignore the part 1, except for the // prefix and suffix. UnicodeString prefix; UnicodeString suffix; int32_t decimalPos = -1; int32_t multiplier = 1; int32_t digitLeftCount = 0, zeroDigitCount = 0, digitRightCount = 0, sigDigitCount = 0; int8_t groupingCount = -1; int8_t groupingCount2 = -1; int32_t padPos = -1; UChar32 padChar = 0; int32_t roundingPos = -1; DigitList roundingInc; int8_t expDigits = -1; UBool expSignAlways = FALSE; // The affix is either the prefix or the suffix. 
UnicodeString* affix = &prefix; int32_t start = pos; UBool isPartDone = FALSE; UChar32 ch; for (; !isPartDone && pos < patLen; ) { // Todo: account for surrogate pairs ch = pattern.char32At(pos); switch (subpart) { case 0: // Pattern proper subpart (between prefix & suffix) // Process the digits, decimal, and grouping characters. We // record five pieces of information. We expect the digits // to occur in the pattern ####00.00####, and we record the // number of left digits, zero (central) digits, and right // digits. The position of the last grouping character is // recorded (should be somewhere within the first two blocks // of characters), as is the position of the decimal point, // if any (should be in the zero digits). If there is no // decimal point, then there should be no right digits. if (pattern.compare(pos, digitLen, fDigit) == 0) { if (zeroDigitCount > 0 || sigDigitCount > 0) { ++digitRightCount; } else { ++digitLeftCount; } if (groupingCount >= 0 && decimalPos < 0) { ++groupingCount; } pos += digitLen; } else if ((ch >= fZeroDigit && ch <= nineDigit) || ch == fSigDigit) { if (digitRightCount > 0) { // Unexpected '0' debug("Unexpected '0'") status = U_UNEXPECTED_TOKEN; syntaxError(pattern,pos,parseError); return; } if (ch == fSigDigit) { ++sigDigitCount; } else { if (ch != fZeroDigit && roundingPos < 0) { roundingPos = digitLeftCount + zeroDigitCount; } if (roundingPos >= 0) { roundingInc.append((char)(ch - fZeroDigit + '0')); } ++zeroDigitCount; } if (groupingCount >= 0 && decimalPos < 0) { ++groupingCount; } pos += U16_LENGTH(ch); } else if (pattern.compare(pos, groupSepLen, fGroupingSeparator) == 0) { if (decimalPos >= 0) { // Grouping separator after decimal debug("Grouping separator after decimal") status = U_UNEXPECTED_TOKEN; syntaxError(pattern,pos,parseError); return; } groupingCount2 = groupingCount; groupingCount = 0; pos += groupSepLen; } else if (pattern.compare(pos, decimalSepLen, fDecimalSeparator) == 0) { if (decimalPos >= 0) { // 
Multiple decimal separators debug("Multiple decimal separators") status = U_MULTIPLE_DECIMAL_SEPARATORS; syntaxError(pattern,pos,parseError); return; } // Intentionally incorporate the digitRightCount, // even though it is illegal for this to be > 0 // at this point. We check pattern syntax below. decimalPos = digitLeftCount + zeroDigitCount + digitRightCount; pos += decimalSepLen; } else { if (pattern.compare(pos, fExponent.length(), fExponent) == 0) { if (expDigits >= 0) { // Multiple exponential symbols debug("Multiple exponential symbols") status = U_MULTIPLE_EXPONENTIAL_SYMBOLS; syntaxError(pattern,pos,parseError); return; } if (groupingCount >= 0) { // Grouping separator in exponential pattern debug("Grouping separator in exponential pattern") status = U_MALFORMED_EXPONENTIAL_PATTERN; syntaxError(pattern,pos,parseError); return; } pos += fExponent.length(); // Check for positive prefix if (pos < patLen && pattern.compare(pos, fPlus.length(), fPlus) == 0) { expSignAlways = TRUE; pos += fPlus.length(); } // Use lookahead to parse out the exponential part of the // pattern, then jump into suffix subpart. expDigits = 0; while (pos < patLen && pattern.char32At(pos) == fZeroDigit) { ++expDigits; pos += U16_LENGTH(fZeroDigit); } // 1. Require at least one mantissa pattern digit // 2. Disallow "#+ @" in mantissa // 3. Require at least one exponent pattern digit if (((digitLeftCount + zeroDigitCount) < 1 && (sigDigitCount + digitRightCount) < 1) || (sigDigitCount > 0 && digitLeftCount > 0) || expDigits < 1) { // Malformed exponential pattern debug("Malformed exponential pattern") status = U_MALFORMED_EXPONENTIAL_PATTERN; syntaxError(pattern,pos,parseError); return; } } // Transition to suffix subpart subpart = 2; // suffix subpart affix = &suffix; sub0Limit = pos; continue; } break; case 1: // Prefix subpart case 2: // Suffix subpart // Process the prefix / suffix characters // Process unquoted characters seen in prefix or suffix // subpart. 
// Several syntax characters implicitly begins the // next subpart if we are in the prefix; otherwise // they are illegal if unquoted. if (!pattern.compare(pos, digitLen, fDigit) || !pattern.compare(pos, groupSepLen, fGroupingSeparator) || !pattern.compare(pos, decimalSepLen, fDecimalSeparator) || (ch >= fZeroDigit && ch <= nineDigit) || ch == fSigDigit) { if (subpart == 1) { // prefix subpart subpart = 0; // pattern proper subpart sub0Start = pos; // Reprocess this character continue; } else { status = U_UNQUOTED_SPECIAL; syntaxError(pattern,pos,parseError); return; } } else if (ch == kCurrencySign) { affix->append(kQuote); // Encode currency // Use lookahead to determine if the currency sign is // doubled or not. U_ASSERT(U16_LENGTH(kCurrencySign) == 1); if ((pos+1) < pattern.length() && pattern[pos+1] == kCurrencySign) { affix->append(kCurrencySign); ++pos; // Skip over the doubled character if ((pos+1) < pattern.length() && pattern[pos+1] == kCurrencySign) { affix->append(kCurrencySign); ++pos; // Skip over the doubled character out.fCurrencySignCount = fgCurrencySignCountInPluralFormat; } else { out.fCurrencySignCount = fgCurrencySignCountInISOFormat; } } else { out.fCurrencySignCount = fgCurrencySignCountInSymbolFormat; } // Fall through to append(ch) } else if (ch == kQuote) { // A quote outside quotes indicates either the opening // quote or two quotes, which is a quote literal. That is, // we have the first quote in 'do' or o''clock. U_ASSERT(U16_LENGTH(kQuote) == 1); ++pos; if (pos < pattern.length() && pattern[pos] == kQuote) { affix->append(kQuote); // Encode quote // Fall through to append(ch) } else { subpart += 2; // open quote continue; } } else if (pattern.compare(pos, fSeparator.length(), fSeparator) == 0) { // Don't allow separators in the prefix, and don't allow // separators in the second pattern (part == 1). 
if (subpart == 1 || part == 1) { // Unexpected separator debug("Unexpected separator") status = U_UNEXPECTED_TOKEN; syntaxError(pattern,pos,parseError); return; } sub2Limit = pos; isPartDone = TRUE; // Go to next part pos += fSeparator.length(); break; } else if (pattern.compare(pos, fPercent.length(), fPercent) == 0) { // Next handle characters which are appended directly. if (multiplier != 1) { // Too many percent/perMill characters debug("Too many percent characters") status = U_MULTIPLE_PERCENT_SYMBOLS; syntaxError(pattern,pos,parseError); return; } affix->append(kQuote); // Encode percent/perMill affix->append(kPatternPercent); // Use unlocalized pattern char multiplier = 100; pos += fPercent.length(); break; } else if (pattern.compare(pos, fPerMill.length(), fPerMill) == 0) { // Next handle characters which are appended directly. if (multiplier != 1) { // Too many percent/perMill characters debug("Too many perMill characters") status = U_MULTIPLE_PERMILL_SYMBOLS; syntaxError(pattern,pos,parseError); return; } affix->append(kQuote); // Encode percent/perMill affix->append(kPatternPerMill); // Use unlocalized pattern char multiplier = 1000; pos += fPerMill.length(); break; } else if (pattern.compare(pos, fPadEscape.length(), fPadEscape) == 0) { if (padPos >= 0 || // Multiple pad specifiers (pos+1) == pattern.length()) { // Nothing after padEscape debug("Multiple pad specifiers") status = U_MULTIPLE_PAD_SPECIFIERS; syntaxError(pattern,pos,parseError); return; } padPos = pos; pos += fPadEscape.length(); padChar = pattern.char32At(pos); pos += U16_LENGTH(padChar); break; } else if (pattern.compare(pos, fMinus.length(), fMinus) == 0) { affix->append(kQuote); // Encode minus affix->append(kPatternMinus); pos += fMinus.length(); break; } else if (pattern.compare(pos, fPlus.length(), fPlus) == 0) { affix->append(kQuote); // Encode plus affix->append(kPatternPlus); pos += fPlus.length(); break; } // Unquoted, non-special characters fall through to here, as // well as 
other code which needs to append something to the // affix. affix->append(ch); pos += U16_LENGTH(ch); break; case 3: // Prefix subpart, in quote case 4: // Suffix subpart, in quote // A quote within quotes indicates either the closing // quote or two quotes, which is a quote literal. That is, // we have the second quote in 'do' or 'don''t'. if (ch == kQuote) { ++pos; if (pos < pattern.length() && pattern[pos] == kQuote) { affix->append(kQuote); // Encode quote // Fall through to append(ch) } else { subpart -= 2; // close quote continue; } } affix->append(ch); pos += U16_LENGTH(ch); break; } } if (sub0Limit == 0) { sub0Limit = pattern.length(); } if (sub2Limit == 0) { sub2Limit = pattern.length(); } /* Handle patterns with no '0' pattern character. These patterns * are legal, but must be recodified to make sense. "##.###" -> * "#0.###". ".###" -> ".0##". * * We allow patterns of the form "####" to produce a zeroDigitCount * of zero (got that?); although this seems like it might make it * possible for format() to produce empty strings, format() checks * for this condition and outputs a zero digit in this situation. * Having a zeroDigitCount of zero yields a minimum integer digits * of zero, which allows proper round-trip patterns. We don't want * "#" to become "#0" when toPattern() is called (even though that's * what it really is, semantically). */ if (zeroDigitCount == 0 && sigDigitCount == 0 && digitLeftCount > 0 && decimalPos >= 0) { // Handle "###.###" and "###." and ".###" int n = decimalPos; if (n == 0) ++n; // Handle ".###" digitRightCount = digitLeftCount - n; digitLeftCount = n - 1; zeroDigitCount = 1; } // Do syntax checking on the digits, decimal points, and quotes. 
if ((decimalPos < 0 && digitRightCount > 0 && sigDigitCount == 0) || (decimalPos >= 0 && (sigDigitCount > 0 || decimalPos < digitLeftCount || decimalPos > (digitLeftCount + zeroDigitCount))) || groupingCount == 0 || groupingCount2 == 0 || (sigDigitCount > 0 && zeroDigitCount > 0) || subpart > 2) { // subpart > 2 == unmatched quote debug("Syntax error") status = U_PATTERN_SYNTAX_ERROR; syntaxError(pattern,pos,parseError); return; } // Make sure pad is at legal position before or after affix. if (padPos >= 0) { if (padPos == start) { padPos = DecimalFormatPattern::kPadBeforePrefix; } else if (padPos+2 == sub0Start) { padPos = DecimalFormatPattern::kPadAfterPrefix; } else if (padPos == sub0Limit) { padPos = DecimalFormatPattern::kPadBeforeSuffix; } else if (padPos+2 == sub2Limit) { padPos = DecimalFormatPattern::kPadAfterSuffix; } else { // Illegal pad position debug("Illegal pad position") status = U_ILLEGAL_PAD_POSITION; syntaxError(pattern,pos,parseError); return; } } if (part == 0) { out.fPosPatternsBogus = FALSE; out.fPosPrefixPattern = prefix; out.fPosSuffixPattern = suffix; out.fNegPatternsBogus = TRUE; out.fNegPrefixPattern.remove(); out.fNegSuffixPattern.remove(); out.fUseExponentialNotation = (expDigits >= 0); if (out.fUseExponentialNotation) { out.fMinExponentDigits = expDigits; } out.fExponentSignAlwaysShown = expSignAlways; int32_t digitTotalCount = digitLeftCount + zeroDigitCount + digitRightCount; // The effectiveDecimalPos is the position the decimal is at or // would be at if there is no decimal. Note that if // decimalPos<0, then digitTotalCount == digitLeftCount + // zeroDigitCount. int32_t effectiveDecimalPos = decimalPos >= 0 ? 
decimalPos : digitTotalCount; UBool isSigDig = (sigDigitCount > 0); out.fUseSignificantDigits = isSigDig; if (isSigDig) { out.fMinimumSignificantDigits = sigDigitCount; out.fMaximumSignificantDigits = sigDigitCount + digitRightCount; } else { int32_t minInt = effectiveDecimalPos - digitLeftCount; out.fMinimumIntegerDigits = minInt; out.fMaximumIntegerDigits = out.fUseExponentialNotation ? digitLeftCount + out.fMinimumIntegerDigits : gDefaultMaxIntegerDigits; out.fMaximumFractionDigits = decimalPos >= 0 ? (digitTotalCount - decimalPos) : 0; out.fMinimumFractionDigits = decimalPos >= 0 ? (digitLeftCount + zeroDigitCount - decimalPos) : 0; } out.fGroupingUsed = groupingCount > 0; out.fGroupingSize = (groupingCount > 0) ? groupingCount : 0; out.fGroupingSize2 = (groupingCount2 > 0 && groupingCount2 != groupingCount) ? groupingCount2 : 0; out.fMultiplier = multiplier; out.fDecimalSeparatorAlwaysShown = decimalPos == 0 || decimalPos == digitTotalCount; if (padPos >= 0) { out.fPadPosition = (DecimalFormatPattern::EPadPosition) padPos; // To compute the format width, first set up sub0Limit - // sub0Start. Add in prefix/suffix length later. 
// fFormatWidth = prefix.length() + suffix.length() + // sub0Limit - sub0Start; out.fFormatWidth = sub0Limit - sub0Start; out.fPad = padChar; } else { out.fFormatWidth = 0; } if (roundingPos >= 0) { out.fRoundingIncrementUsed = TRUE; roundingInc.setDecimalAt(effectiveDecimalPos - roundingPos); out.fRoundingIncrement = roundingInc; } else { out.fRoundingIncrementUsed = FALSE; } } else { out.fNegPatternsBogus = FALSE; out.fNegPrefixPattern = prefix; out.fNegSuffixPattern = suffix; } } if (pattern.length() == 0) { out.fNegPatternsBogus = TRUE; out.fNegPrefixPattern.remove(); out.fNegSuffixPattern.remove(); out.fPosPatternsBogus = FALSE; out.fPosPrefixPattern.remove(); out.fPosSuffixPattern.remove(); out.fMinimumIntegerDigits = 0; out.fMaximumIntegerDigits = kDoubleIntegerDigits; out.fMinimumFractionDigits = 0; out.fMaximumFractionDigits = kDoubleFractionDigits; out.fUseExponentialNotation = FALSE; out.fCurrencySignCount = fgCurrencySignCountZero; out.fGroupingUsed = FALSE; out.fGroupingSize = 0; out.fGroupingSize2 = 0; out.fMultiplier = 1; out.fDecimalSeparatorAlwaysShown = FALSE; out.fFormatWidth = 0; out.fRoundingIncrementUsed = FALSE; } // If there was no negative pattern, or if the negative pattern is // identical to the positive pattern, then prepend the minus sign to the // positive pattern to form the negative pattern. if (out.fNegPatternsBogus || (out.fNegPrefixPattern == out.fPosPrefixPattern && out.fNegSuffixPattern == out.fPosSuffixPattern)) { out.fNegPatternsBogus = FALSE; out.fNegSuffixPattern = out.fPosSuffixPattern; out.fNegPrefixPattern.remove(); out.fNegPrefixPattern.append(kQuote).append(kPatternMinus) .append(out.fPosPrefixPattern); } // TODO: Deprecate/Remove out.fNegSuffixPattern and 3 other fields. 
AffixPattern::parseAffixString( out.fNegSuffixPattern, out.fNegSuffixAffix, status); AffixPattern::parseAffixString( out.fPosSuffixPattern, out.fPosSuffixAffix, status); AffixPattern::parseAffixString( out.fNegPrefixPattern, out.fNegPrefixAffix, status); AffixPattern::parseAffixString( out.fPosPrefixPattern, out.fPosPrefixAffix, status); }
/**
 * Simple recursive implementation of permutation.
 * TODO: optimize
 *
 * For each code point of source, the permutations of the remaining code
 * points are computed recursively and that code point is prefixed to each
 * of them. Every permutation is stored in result, keyed by its own text.
 *
 * @param source    the string to find permutations for
 * @param skipZeros if TRUE, a code point of canonical combining class zero
 *                  that is not at index 0 is never chosen as the leading
 *                  code point of a permutation
 * @param result    table that receives the permutations; each value is a
 *                  heap-allocated UnicodeString handed to result->put()
 * @param status    error code; nothing is done if it already indicates a
 *                  failure on entry, and it is set to
 *                  U_MEMORY_ALLOCATION_ERROR when a string cannot be
 *                  allocated
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return;
    }
    //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));

    // optimization:
    // if zero or one code point, just store the string itself.
    // The length() <= 2 test comes first so that code points are only
    // counted for strings short enough to possibly hold a single one.
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *toPut = new UnicodeString(source);
        /* test for NULL */
        if (toPut == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, toPut, status);
        return;
    }

    // otherwise iterate through the string, and recursively permute all the other characters
    UChar32 cp;
    Hashtable subpermute(status);
    if(U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uprv_deleteUObject);

    for (int32_t i = 0; i < source.length(); i += U16_LENGTH(cp)) {
        cp = source.char32At(i);

        // optimization:
        // if the character is canonical combining class zero,
        // don't permute it
        if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
            //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
            continue;
        }

        subpermute.removeAll();

        // see what the permutations of the characters before and after this one are.
        // replace() modifies its target, so it is applied to a scratch copy;
        // the copy is made only once we know this code point is not skipped.
        UnicodeString subPermuteString = source;
        permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status);
        /* Test for buffer overflows */
        if(U_FAILURE(status)) {
            return;
        }

        // prefix this character to all of them
        int32_t el = UHASH_FIRST;
        const UHashElement *ne = subpermute.nextElement(el);
        while (ne != NULL) {
            const UnicodeString *permRes = static_cast<const UnicodeString *>(ne->value.pointer);
            UnicodeString *chStr = new UnicodeString(cp);
            //test for NULL
            if (chStr == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            chStr->append(*permRes);
            //if (PROGRESS) printf("  Piece: %s\n", UToS(*chStr));
            result->put(*chStr, chStr, status);
            // Stop as soon as an insertion reports failure instead of
            // continuing to allocate and insert with a failed status.
            if(U_FAILURE(status)) {
                return;
            }
            ne = subpermute.nextElement(el);
        }
    }
}