// Returns a new string built from the UTF-16 input in which every code point
// has been passed through mirroredChar(). Supplementary results are written
// back as a lead/trail surrogate pair.
String SVGFontData::createStringWithMirroredCharacters(const UChar* characters, unsigned length) const
{
    StringBuilder builder;
    builder.reserveCapacity(length);

    for (unsigned offset = 0; offset < length;) {
        // U16_NEXT decodes one code point and advances offset past it.
        UChar32 codePoint;
        U16_NEXT(characters, offset, length, codePoint);
        codePoint = mirroredChar(codePoint);

        if (U16_LENGTH(codePoint) != 1) {
            builder.append(U16_LEAD(codePoint));
            builder.append(U16_TRAIL(codePoint));
            continue;
        }
        builder.append(static_cast<UChar>(codePoint));
    }

    return builder.toString();
}
/**
 * Try the rules indexed under the low byte of the code point at pos.start,
 * in order, until one of them reports a full or a partial match.
 *
 * @param text        the text to be transliterated
 * @param pos         the position indices; updated by the matching rule, or
 *                    advanced past one code point when no rule matches
 * @param incremental passed through to each rule's matchAndReplace()
 * @return FALSE if a rule reported U_PARTIAL_MATCH (the caller should wait
 *         for more text); TRUE after a U_MATCH or when no rule matched.
 */
UBool TransliterationRuleSet::transliterate(Replaceable& text,
                                            UTransPosition& pos,
                                            UBool incremental) {
    const int16_t bucket = (int16_t) (text.char32At(pos.start) & 0xFF);

    int32_t ruleIndex = index[bucket];
    while (ruleIndex < index[bucket + 1]) {
        const UMatchDegree degree =
            rules[ruleIndex]->matchAndReplace(text, pos, incremental);
        if (degree == U_MATCH) {
            _debugOut("match", rules[ruleIndex], text, pos);
            return TRUE;
        }
        if (degree == U_PARTIAL_MATCH) {
            _debugOut("partial match", rules[ruleIndex], text, pos);
            return FALSE;
        }
        // Any other degree: fall through to the next rule in this bucket.
        ++ruleIndex;
    }

    // Neither a match nor a partial match from any rule: skip one code point.
    pos.start += U16_LENGTH(text.char32At(pos.start));
    _debugOut("no match", NULL, text, pos);
    return TRUE;
}
/*
 * Binary-property callback: reports whether case folding changes c.
 * If c has a decomposition (per the NFC Normalizer2 instance) that is a
 * single code point, that code point is tested with ucase_toFullFolding();
 * a longer decomposition is folded as a string and compared with itself.
 * Returns FALSE if the normalizer cannot be obtained or c is negative.
 */
static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }

    UnicodeString nfd;
    if(!nfcNorm2->getDecomposition(c, nfd)) {
        if(c<0) {
            return FALSE;  /* protect against bad input */
        }
    } else {
        /* c has a decomposition: reduce it to one code point if possible */
        const int32_t nfdLength=nfd.length();
        if(nfdLength==1) {
            c=nfd[0];  /* single BMP code point */
        } else if(nfdLength<=U16_MAX_LENGTH) {
            c=nfd.char32At(0);
            if(nfdLength!=U16_LENGTH(c)) {
                c=U_SENTINEL;  /* not exactly one supplementary code point */
            }
        } else {
            c=U_SENTINEL;
        }
    }

    if(c<0) {
        /* multi-code point decomposition: fold the whole string */
        /* guess some large but stack-friendly capacity */
        UChar dest[2*UCASE_MAX_STRING_LENGTH];
        const int32_t destLength=u_strFoldCase(dest, LENGTHOF(dest),
                                               nfd.getBuffer(), nfd.length(),
                                               U_FOLD_CASE_DEFAULT, &errorCode);
        return (UBool)(U_SUCCESS(errorCode) &&
                       0!=u_strCompare(nfd.getBuffer(), nfd.length(),
                                       dest, destLength, FALSE));
    }

    /* single code point */
    const UCaseProps *csp=ucase_getSingleton();
    const UChar *resultString;
    return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
}
/**
 * Append one fixed-width record to the `unfold` table: the folded string s
 * padded with zeros to UGENCASE_UNFOLD_STRING_WIDTH code units, followed by
 * the code point c padded to UGENCASE_UNFOLD_CP_WIDTH code units.
 *
 * @param c         the code point whose folding is s
 * @param s         the case-folded string
 * @param errorCode in/out ICU error code; nothing is done if it already
 *                  indicates failure. Set to U_INTERNAL_PROGRAM_ERROR when s
 *                  is longer than UGENCASE_UNFOLD_STRING_WIDTH.
 */
void CasePropsBuilder::addUnfolding(UChar32 c, const UnicodeString &s, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }

    int32_t length=s.length();
    if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
        fprintf(stderr, "genprops error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
                (long)length, UGENCASE_UNFOLD_STRING_WIDTH);
        errorCode=U_INTERNAL_PROGRAM_ERROR;
        /*
         * Stop here: appending an over-long string would produce a record
         * that is not UGENCASE_UNFOLD_WIDTH units wide, which breaks the
         * table layout and the assertion below.
         */
        return;
    }

    unfold.append(s);
    /* pad the string part to its fixed width */
    while(length<UGENCASE_UNFOLD_STRING_WIDTH) {
        unfold.append(0);
        ++length;
    }

    unfold.append(c);
    /* pad the code point part when c occupies fewer units than the slot */
    if(U16_LENGTH(c)<UGENCASE_UNFOLD_CP_WIDTH) {
        unfold.append(0);
    }
    U_ASSERT((unfold.length()%UGENCASE_UNFOLD_WIDTH)==0);
}
/**
 * Step backward by one code point and return it.
 * Returns U_SENTINEL when pos is already 0 in CHECK_BWD state, or when
 * previousSegment() reports failure.
 */
UChar32 FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state != CHECK_BWD) {
            if(state == IN_FCD_SEGMENT && pos != start) {
                U8_PREV_OR_FFFD(u8, 0, pos, c);
                return c;
            }
            if(state >= IN_NORMALIZED && pos != 0) {
                // Reading from the UTF-16 normalized buffer.
                c = normalized.char32At(pos - 1);
                pos -= U16_LENGTH(c);
                return c;
            }
            switchToBackward();
            continue;
        }

        // state == CHECK_BWD: reading the UTF-8 input directly.
        if(pos == 0) {
            return U_SENTINEL;
        }
        c = u8[pos - 1];
        if(c < 0x80) {
            // Single-byte (ASCII) fast path.
            --pos;
            return c;
        }
        U8_PREV_OR_FFFD(u8, 0, pos, c);

        if(!CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c))) {
            return c;
        }
        if(!CollationFCD::maybeTibetanCompositeVowel(c) &&
                !(pos != 0 && previousHasTccc())) {
            return c;
        }

        // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
        // and we can use U8_LENGTH() rather than a previous-position variable.
        pos += U8_LENGTH(c);
        if(!previousSegment(errorCode)) {
            return U_SENTINEL;
        }
        // Loop again with whatever state previousSegment() left behind.
    }
}
/**
 * Step forward by one code point and return it.
 * Returns U_SENTINEL at the end of the input (pos == length, or a NUL byte
 * when length is negative), or when nextSegment() reports failure.
 */
UChar32 FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state != CHECK_FWD) {
            if(state == IN_FCD_SEGMENT && pos != limit) {
                U8_NEXT_OR_FFFD(u8, pos, length, c);
                return c;
            }
            if(state == IN_NORMALIZED && pos != normalized.length()) {
                // Reading from the UTF-16 normalized buffer.
                c = normalized.char32At(pos);
                pos += U16_LENGTH(c);
                return c;
            }
            switchToForward();
            continue;
        }

        // state == CHECK_FWD: reading the UTF-8 input directly.
        if(pos == length) {
            return U_SENTINEL;
        }
        c = u8[pos];
        if(c == 0 && length < 0) {
            // NUL terminator of an input whose length is negative.
            return U_SENTINEL;
        }
        if(c < 0x80) {
            // Single-byte (ASCII) fast path.
            ++pos;
            return c;
        }
        U8_NEXT_OR_FFFD(u8, pos, length, c);

        if(!CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c))) {
            return c;
        }
        if(!CollationFCD::maybeTibetanCompositeVowel(c) &&
                !(pos != length && nextHasLccc())) {
            return c;
        }

        // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
        // and we can use U8_LENGTH() rather than a previous-position variable.
        pos -= U8_LENGTH(c);
        if(!nextSegment(errorCode)) {
            return U_SENTINEL;
        }
        // Loop again with whatever state nextSegment() left behind.
    }
}
void DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status, UBool useLastResortData) { if (U_FAILURE(status)) { return; } *validLocale = *actualLocale = 0; currPattern = NULL; // First initialize all the symbols to the fallbacks for anything we can't find initialize(); // // Next get the numbering system for this locale and set zero digit // and the digit string based on the numbering system for the locale // LocalPointer<NumberingSystem> ns(NumberingSystem::createInstance(loc, status)); const char *nsName; if (U_SUCCESS(status) && ns->getRadix() == 10 && !ns->isAlgorithmic()) { nsName = ns->getName(); UnicodeString digitString(ns->getDescription()); int32_t digitIndex = 0; UChar32 digit = digitString.char32At(0); fSymbols[kZeroDigitSymbol].setTo(digit); for (int32_t i = kOneDigitSymbol; i <= kNineDigitSymbol; ++i) { digitIndex += U16_LENGTH(digit); digit = digitString.char32At(digitIndex); fSymbols[i].setTo(digit); } } else { nsName = gLatn; } // Open resource bundles const char* locStr = loc.getName(); LocalUResourceBundlePointer resource(ures_open(NULL, locStr, &status)); LocalUResourceBundlePointer numberElementsRes( ures_getByKeyWithFallback(resource.getAlias(), gNumberElements, NULL, &status)); if (U_FAILURE(status)) { if ( useLastResortData ) { status = U_USING_DEFAULT_WARNING; initialize(); } return; } // Set locale IDs // TODO: Is there a way to do this without depending on the resource bundle instance? U_LOCALE_BASED(locBased, *this); locBased.setLocaleIDs( ures_getLocaleByType( numberElementsRes.getAlias(), ULOC_VALID_LOCALE, &status), ures_getLocaleByType( numberElementsRes.getAlias(), ULOC_ACTUAL_LOCALE, &status)); // Now load the rest of the data from the data sink. // Start with loading this nsName if it is not Latin. 
DecFmtSymDataSink sink(*this); if (uprv_strcmp(nsName, gLatn) != 0) { CharString path; path.append(gNumberElements, status) .append('/', status) .append(nsName, status) .append('/', status) .append(gSymbols, status); ures_getAllItemsWithFallback(resource.getAlias(), path.data(), sink, status); // If no symbols exist for the given nsName and resource bundle, silently ignore // and fall back to Latin. if (status == U_MISSING_RESOURCE_ERROR) { status = U_ZERO_ERROR; } else if (U_FAILURE(status)) { return; } } // Continue with Latin if necessary. if (!sink.seenAll()) { ures_getAllItemsWithFallback(resource.getAlias(), gNumberElementsLatnSymbols, sink, status); if (U_FAILURE(status)) { return; } } // Let the monetary number separators equal the default number separators if necessary. sink.resolveMissingMonetarySeparators(fSymbols); // Obtain currency data from the currency API. This is strictly // for backward compatibility; we don't use DecimalFormatSymbols // for currency data anymore. UErrorCode internalStatus = U_ZERO_ERROR; // don't propagate failures out UChar curriso[4]; UnicodeString tempStr; ucurr_forLocale(locStr, curriso, 4, &internalStatus); uprv_getStaticCurrencyName(curriso, locStr, tempStr, internalStatus); if (U_SUCCESS(internalStatus)) { fSymbols[kIntlCurrencySymbol].setTo(curriso, -1); fSymbols[kCurrencySymbol] = tempStr; } /* else use the default values. 
*/ //load the currency data UChar ucc[4]={0}; //Currency Codes are always 3 chars long int32_t uccLen = 4; const char* locName = loc.getName(); UErrorCode localStatus = U_ZERO_ERROR; uccLen = ucurr_forLocale(locName, ucc, uccLen, &localStatus); if(U_SUCCESS(localStatus) && uccLen > 0) { char cc[4]={0}; u_UCharsToChars(ucc, cc, uccLen); /* An explicit currency was requested */ LocalUResourceBundlePointer currencyResource(ures_open(U_ICUDATA_CURR, locStr, &localStatus)); LocalUResourceBundlePointer currency( ures_getByKeyWithFallback(currencyResource.getAlias(), "Currencies", NULL, &localStatus)); ures_getByKeyWithFallback(currency.getAlias(), cc, currency.getAlias(), &localStatus); if(U_SUCCESS(localStatus) && ures_getSize(currency.getAlias())>2) { // the length is 3 if more data is present ures_getByIndex(currency.getAlias(), 2, currency.getAlias(), &localStatus); int32_t currPatternLen = 0; currPattern = ures_getStringByIndex(currency.getAlias(), (int32_t)0, &currPatternLen, &localStatus); UnicodeString decimalSep = ures_getUnicodeStringByIndex(currency.getAlias(), (int32_t)1, &localStatus); UnicodeString groupingSep = ures_getUnicodeStringByIndex(currency.getAlias(), (int32_t)2, &localStatus); if(U_SUCCESS(localStatus)){ fSymbols[kMonetaryGroupingSeparatorSymbol] = groupingSep; fSymbols[kMonetarySeparatorSymbol] = decimalSep; //pattern.setTo(TRUE, currPattern, currPatternLen); status = localStatus; } } /* else An explicit currency was requested and is unknown or locale data is malformed. */ /* ucurr_* API will get the correct value later on. */ } // else ignore the error if no currency // Currency Spacing. 
localStatus = U_ZERO_ERROR; LocalUResourceBundlePointer currencyResource(ures_open(U_ICUDATA_CURR, locStr, &localStatus)); LocalUResourceBundlePointer currencySpcRes( ures_getByKeyWithFallback(currencyResource.getAlias(), gCurrencySpacingTag, NULL, &localStatus)); if (localStatus == U_USING_FALLBACK_WARNING || U_SUCCESS(localStatus)) { const char* keywords[UNUM_CURRENCY_SPACING_COUNT] = { gCurrencyMatchTag, gCurrencySudMatchTag, gCurrencyInsertBtnTag }; localStatus = U_ZERO_ERROR; LocalUResourceBundlePointer dataRes( ures_getByKeyWithFallback(currencySpcRes.getAlias(), gBeforeCurrencyTag, NULL, &localStatus)); if (localStatus == U_USING_FALLBACK_WARNING || U_SUCCESS(localStatus)) { localStatus = U_ZERO_ERROR; for (int32_t i = 0; i < UNUM_CURRENCY_SPACING_COUNT; i++) { currencySpcBeforeSym[i] = ures_getUnicodeStringByKey(dataRes.getAlias(), keywords[i], &localStatus); } } dataRes.adoptInstead( ures_getByKeyWithFallback(currencySpcRes.getAlias(), gAfterCurrencyTag, NULL, &localStatus)); if (localStatus == U_USING_FALLBACK_WARNING || U_SUCCESS(localStatus)) { localStatus = U_ZERO_ERROR; for (int32_t i = 0; i < UNUM_CURRENCY_SPACING_COUNT; i++) { currencySpcAfterSym[i] = ures_getUnicodeStringByKey(dataRes.getAlias(), keywords[i], &localStatus); } } } }
/**
 * Add to fillinResult the segment itself plus every string formed by
 * replacing a tail of the segment with a code point from the canonical start
 * set of one of its characters, combined with each remainder that extract()
 * produces.
 *
 * @param fillinResult table receiving the strings (key and value are the
 *                     same text; values are heap-allocated UnicodeStrings)
 * @param segment      the UTF-16 segment
 * @param segLen       number of code units in segment
 * @param status       in/out ICU error code
 * @return fillinResult, or NULL if status indicates failure on entry or on
 *         exit, or if an allocation fails
 */
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The segment is always equivalent to itself.
    UnicodeString whole(segment, segLen);
    fillinResult->put(whole, new UnicodeString(whole), status);

    UnicodeSet starts;
    UChar32 cp;
    // cycle through all the characters
    for (int32_t offset = 0; offset < segLen; offset += U16_LENGTH(cp)) {
        // see if any character is at the start of some decomposition
        U16_GET(segment, 0, offset, segLen, cp);
        if (nfcImpl.getCanonStartSet(cp, starts)) {
            // if so, see which decompositions match
            UnicodeSetIterator candidates(starts);
            while (candidates.next()) {
                UChar32 composite = candidates.getCodepoint();
                Hashtable remainder(status);
                remainder.setValueDeleter(uprv_deleteUObject);
                if (extract(&remainder, composite, segment, segLen, offset, status) == NULL) {
                    continue;
                }

                // there were some matches, so add all the possibilities to the set.
                UnicodeString prefix(segment, offset);
                prefix += composite;

                int32_t el = UHASH_FIRST;
                for (const UHashElement *ne = remainder.nextElement(el);
                        ne != NULL;
                        ne = remainder.nextElement(el)) {
                    const UnicodeString &suffix =
                        *static_cast<const UnicodeString *>(ne->value.pointer);
                    UnicodeString *combined = new UnicodeString(prefix);
                    /* test for NULL */
                    if (combined == 0) {
                        status = U_MEMORY_ALLOCATION_ERROR;
                        return NULL;
                    }
                    *combined += suffix;
                    fillinResult->put(*combined, combined, status);
                }
            }
        }
    }

    /* Test for buffer overflows */
    return U_FAILURE(status) ? NULL : fillinResult;
}
/**
 * Simple recursive implementation of permutation.
 * TODO: optimize
 * @param source    the string to find permutations for
 * @param skipZeros if TRUE, a code point with canonical combining class zero
 *                  that is not at index 0 is not moved to the front
 * @param result    table receiving the permutations (key and value are the
 *                  same text; values are heap-allocated UnicodeStrings)
 * @param status    in/out ICU error code; nothing is done if it already
 *                  indicates failure
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return;
    }

    // optimization:
    // if zero or one character, just return a set with it
    // the cheap length() test comes first to keep from counting code points all the time
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *single = new UnicodeString(source);
        /* test for NULL */
        if (single == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, single, status);
        return;
    }

    // otherwise iterate through the string, and recursively permute all the other characters
    Hashtable subpermute(status);
    if(U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uprv_deleteUObject);

    int32_t offset = 0;
    while (offset < source.length()) {
        const UChar32 cp = source.char32At(offset);
        const int32_t cpUnits = U16_LENGTH(cp);

        // optimization:
        // if the character is canonical combining class zero,
        // don't permute it
        if (!(skipZeros && offset != 0 && u_getCombiningClass(cp) == 0)) {
            // Work on a copy with this code point removed; replace() is destructive.
            UnicodeString remaining = source;
            remaining.replace(offset, cpUnits, NULL, 0);

            // see what the permutations of the characters before and after this one are
            subpermute.removeAll();
            permute(remaining, skipZeros, &subpermute, status);
            /* Test for buffer overflows */
            if(U_FAILURE(status)) {
                return;
            }

            // prefix this character to all of them
            int32_t el = UHASH_FIRST;
            for (const UHashElement *ne = subpermute.nextElement(el);
                    ne != NULL;
                    ne = subpermute.nextElement(el)) {
                UnicodeString *tail = (UnicodeString *)(ne->value.pointer);
                UnicodeString *permutation = new UnicodeString(cp);
                //test for NULL
                if (permutation == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                permutation->append(*tail);
                result->put(*permutation, permutation, status);
            }
        }

        offset += cpUnits;
    }
}
/** *@param set the source string to iterate against. This allows the same iterator to be used * while changing the source string, saving object creation. */ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { int32_t list_length = 0; UChar32 cp = 0; int32_t start = 0; int32_t i = 0; UnicodeString *list = NULL; nfd.normalize(newSource, source, status); if(U_FAILURE(status)) { return; } done = FALSE; cleanPieces(); // catch degenerate case if (newSource.length() == 0) { pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); pieces_length = 1; current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); current_length = 1; if (pieces == NULL || pieces_lengths == NULL || current == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } current[0] = 0; pieces[0] = new UnicodeString[1]; pieces_lengths[0] = 1; if (pieces[0] == 0) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } return; } list = new UnicodeString[source.length()]; if (list == 0) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } // i should initialy be the number of code units at the // start of the string i = U16_LENGTH(source.char32At(0)); //int32_t i = 1; // find the segments // This code iterates through the source string and // extracts segments that end up on a codepoint that // doesn't start any decompositions. (Analysis is done // on the NFD form - see above). 
for (; i < source.length(); i += U16_LENGTH(cp)) { cp = source.char32At(i); if (nfcImpl.isCanonSegmentStarter(cp)) { source.extract(start, i-start, list[list_length++]); // add up to i start = i; } } source.extract(start, i-start, list[list_length++]); // add last one // allocate the arrays, and find the strings that are CE to each segment pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); pieces_length = list_length; pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); current_length = list_length; if (pieces == NULL || pieces_lengths == NULL || current == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } for (i = 0; i < current_length; i++) { current[i] = 0; } // for each segment, get all the combinations that can produce // it after NFD normalization for (i = 0; i < pieces_length; ++i) { //if (PROGRESS) printf("SEGMENT\n"); pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); } delete[] list; return; // Common section to cleanup all local variables and reset object variables. CleanPartialInitialization: if (list != NULL) { delete[] list; } cleanPieces(); }
//Tests for new API for utf-16 support void CharIterTest::TestIterationUChar32() { UChar textChars[]={ 0x0061, 0x0062, 0xd841, 0xdc02, 0x20ac, 0xd7ff, 0xd842, 0xdc06, 0xd801, 0xdc00, 0x0061, 0x0000}; UnicodeString text(textChars); UChar32 c; int32_t i; { StringCharacterIterator iter(text, 1); UnicodeString iterText; iter.getText(iterText); if (iterText != text) errln("iter.getText() failed"); if (iter.current32() != text[(int32_t)1]) errln("Iterator didn't start out in the right place."); c=iter.setToStart(); i=0; i=iter.move32(1, CharacterIterator::kStart); c=iter.current32(); if(c != text.char32At(1) || i!=1) errln("move32(1, kStart) didn't work correctly expected %X got %X", c, text.char32At(1) ); i=iter.move32(2, CharacterIterator::kCurrent); c=iter.current32(); if(c != text.char32At(4) || i!=4) errln("move32(2, kCurrent) didn't work correctly expected %X got %X i=%ld", c, text.char32At(4), i); i=iter.move32(-2, CharacterIterator::kCurrent); c=iter.current32(); if(c != text.char32At(1) || i!=1) errln("move32(-2, kCurrent) didn't work correctly expected %X got %X i=%d", c, text.char32At(1), i); i=iter.move32(-2, CharacterIterator::kEnd); c=iter.current32(); if(c != text.char32At((text.length()-3)) || i!=(text.length()-3)) errln("move32(-2, kEnd) didn't work correctly expected %X got %X i=%d", c, text.char32At((text.length()-3)), i); c = iter.first32(); i = 0; if (iter.startIndex() != 0 || iter.endIndex() != text.length()) errln("startIndex() or endIndex() failed"); logln("Testing forward iteration..."); do { /* logln("c=%d i=%d char32At=%d", c, i, text.char32At(i)); */ if (c == CharacterIterator::DONE && i != text.length()) errln("Iterator reached end prematurely"); else if(iter.hasNext() == FALSE && i != text.length()) errln("Iterator reached end prematurely. 
Failed at hasNext"); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.next32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i+2 : i+1; } } while (c != CharacterIterator::DONE); if(iter.hasNext() == TRUE) errln("hasNext() returned true at the end of the string"); c=iter.setToEnd(); if(iter.getIndex() != text.length() || iter.hasNext() != FALSE) errln("setToEnd failed"); c=iter.next32(); if(c!= CharacterIterator::DONE) errln("next32 didn't return DONE at the end"); c=iter.setIndex32(text.length()+1); if(c!= CharacterIterator::DONE) errln("setIndex32(len+1) didn't return DONE"); c = iter.last32(); i = text.length()-1; logln("Testing backward iteration..."); do { if (c == CharacterIterator::DONE && i >= 0) errln((UnicodeString)"Iterator reached start prematurely for i=" + i); else if(iter.hasPrevious() == FALSE && i>0) errln((UnicodeString)"Iterator reached start prematurely for i=" + i); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (iter.getIndex() != i) errln("getIndex() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.previous32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i-2 : i-1; } } while (c != CharacterIterator::DONE); if(iter.hasPrevious() == TRUE) errln("hasPrevious returned true after reaching the start"); c=iter.previous32(); if(c!= CharacterIterator::DONE) errln("previous32 didn't return DONE at the beginning"); //testing first32PostInc, next32PostInc, setTostart i = 0; c=iter.first32PostInc(); if(c != text.char32At(i)) errln("first32PostInc failed. 
Expected->%X Got->%X", text.char32At(i), c); if(iter.getIndex() != U16_LENGTH(c) + i) errln((UnicodeString)"getIndex() after first32PostInc() failed"); iter.setToStart(); i=0; if (iter.startIndex() != 0) errln("setToStart failed"); logln("Testing forward iteration..."); do { if (c != CharacterIterator::DONE) c = iter.next32PostInc(); if(c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i+2 : i+1; if(iter.getIndex() != i) errln("getIndex() aftr next32PostInc() isn't working right"); if(iter.current32() != text.char32At(i)) errln("current() after next32PostInc() isn't working right"); } while (iter.hasNext()); c=iter.next32PostInc(); if(c!= CharacterIterator::DONE) errln("next32PostInc() didn't return DONE at the beginning"); } { StringCharacterIterator iter(text, 1, 11, 10); if (iter.startIndex() != 1 || iter.endIndex() != 11) errln("creation of a restricted-range iterator failed"); if (iter.getIndex() != 10 || iter.current32() != text.char32At(10)) errln("starting the iterator in the middle didn't work"); c = iter.first32(); i = 1; logln("Testing forward iteration over a range..."); do { if (c == CharacterIterator::DONE && i != 11) errln("Iterator reached end prematurely"); else if(iter.hasNext() == FALSE) errln("Iterator reached end prematurely"); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.next32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? 
i+2 : i+1; } } while (c != CharacterIterator::DONE); c=iter.next32(); if(c != CharacterIterator::DONE) errln("error in next32()"); c=iter.last32(); i = 10; logln("Testing backward iteration over a range..."); do { if (c == CharacterIterator::DONE && i >= 5) errln("Iterator reached start prematurely"); else if(iter.hasPrevious() == FALSE && i > 5) errln("Iterator reached start prematurely"); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if (iter.getIndex() != i) errln("getIndex() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.previous32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i-2 : i-1; } } while (c != CharacterIterator::DONE); c=iter.previous32(); if(c!= CharacterIterator::DONE) errln("error on previous32"); } }
/*
 * Decode a Punycode string into UTF-16.
 *
 * src/srcLength   input code units; srcLength==-1 means NUL-terminated.
 * dest/destCapacity output buffer; dest may be NULL only if destCapacity is 0.
 *                 Output is written only while it fits; the full length is
 *                 still counted.
 * caseFlags       optional; when not NULL, receives one flag per output code
 *                 unit derived from the case of the input character.
 * pErrorCode      in/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR for
 *                 bad arguments, U_INVALID_CHAR_FOUND for a non-basic or
 *                 non-digit input character, and U_ILLEGAL_CHAR_FOUND for
 *                 truncated input or numeric overflow.
 *
 * Returns the result of u_terminateUChars() for the decoded length, or 0 on
 * error.
 */
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;
    U_ASSERT(destLength>=0);

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            /*
             * Only ASCII code units can be digits. Look the unit up by its
             * full 16-bit value: truncating it to 8 bits would let a
             * non-ASCII unit such as U+0161 pass as the digit 'a'.
             */
            b=src[in++];
            digit= b<=0x7f ? basicToDigit[b] : -1;
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(t>TMAX) {
                t=TMAX;
            }
            */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || U_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=U16_LENGTH(n);
        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=U16_LEAD(n);
                dest[codeUnitIndex+1]=U16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        U_ASSERT(destLength>=0);
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
// Returns the index of the code point that ends just before pos, stepping
// over a whole surrogate pair when there is one. For pos <= 0 there is no
// preceding code point and the result is simply pos - 1.
static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
    if (pos <= 0) {
        return pos - 1;
    }
    const UChar32 preceding = str.char32At(pos - 1);
    return pos - U16_LENGTH(preceding);
}
/*
 * Loads the number-formatting symbols for the locale `loc` into this object.
 *
 * Steps, in the order the code performs them:
 *  1. Opens the locale's resource bundle and its number-elements table. If
 *     that fails and useLastResortData is set, status becomes
 *     U_USING_DEFAULT_WARNING and the argument-less initialize() installs
 *     the built-in defaults; in either failure case the function returns.
 *  2. Calls the argument-less initialize() so every symbol has a fallback.
 *  3. Asks NumberingSystem for the locale's numbering system. For a radix-10,
 *     non-algorithmic system the ten digit symbols are read code point by
 *     code point from its description string; otherwise "latn" is used.
 *  4. For every non-NULL key in gNumberElementKeys, reads the symbol from the
 *     numbering-system-specific "symbols" table, falling back to the "latn"
 *     table, and stores it with setSymbol().
 *  5. If no monetary decimal / grouping separator was loaded, copies the
 *     non-monetary ones.
 *  6. Fills the currency and international-currency symbols from
 *     ucurr_forLocale() / uprv_getStaticCurrencyName(); errors here use a
 *     private status and are not propagated.
 *  7. Records the valid/actual locale IDs of the number-elements resource.
 *  8. If the locale has a currency whose "Currencies" entry has more than two
 *     items, loads its currency pattern and monetary separators.
 *  9. Loads the currency-spacing strings (before/after currency).
 *
 * @param loc               locale whose symbols are loaded
 * @param status            in/out ICU error code; nothing is loaded when it
 *                          already indicates failure on entry
 * @param useLastResortData when TRUE, a failure in step 1 installs the
 *                          hard-coded defaults instead of leaving a failure
 *
 * NOTE(review): ucurr_forLocale() is called twice (steps 6 and 8) with the
 * same locale name; the second call looks redundant - confirm before merging.
 */
void DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status, UBool useLastResortData) { static const char *gNumberElementKeys[kFormatSymbolCount] = { "decimal", "group", "list", "percentSign", NULL, /* Native zero digit is deprecated from CLDR - get it from the numbering system */ NULL, /* Pattern digit character is deprecated from CLDR - use # by default always */ "minusSign", "plusSign", NULL, /* currency symbol - We don't really try to load this directly from CLDR until we know the currency */ NULL, /* intl currency symbol - We don't really try to load this directly from CLDR until we know the currency */ "currencyDecimal", "exponential", "perMille", NULL, /* Escape padding character - not in CLDR */ "infinity", "nan", NULL, /* Significant digit symbol - not in CLDR */ "currencyGroup", NULL, /* one digit - get it from the numbering system */ NULL, /* two digit - get it from the numbering system */ NULL, /* three digit - get it from the numbering system */ NULL, /* four digit - get it from the numbering system */ NULL, /* five digit - get it from the numbering system */ NULL, /* six digit - get it from the numbering system */ NULL, /* seven digit - get it from the numbering system */ NULL, /* eight digit - get it from the numbering system */ NULL, /* nine digit - get it from the numbering system */ "superscriptingExponent", /* Multiplication (x) symbol for exponents */ }; static const char *gLatn = "latn"; static const char *gSymbols = "symbols"; const char *nsName; const UChar *sym = NULL; int32_t len = 0; *validLocale = *actualLocale = 0; currPattern = NULL; if (U_FAILURE(status)) return; const char* locStr = loc.getName(); LocalUResourceBundlePointer resource(ures_open(NULL, locStr, &status)); LocalUResourceBundlePointer numberElementsRes( ures_getByKeyWithFallback(resource.getAlias(), gNumberElements, NULL, &status)); if (U_FAILURE(status)) { if ( useLastResortData ) { status = U_USING_DEFAULT_WARNING; initialize(); } return; } // First 
initialize all the symbols to the fallbacks for anything we can't find initialize(); // // Next get the numbering system for this locale and set zero digit // and the digit string based on the numbering system for the locale // LocalPointer<NumberingSystem> ns(NumberingSystem::createInstance(loc, status)); if (U_SUCCESS(status) && ns->getRadix() == 10 && !ns->isAlgorithmic()) { nsName = ns->getName(); UnicodeString digitString(ns->getDescription()); int32_t digitIndex = 0; UChar32 digit = digitString.char32At(0); fSymbols[kZeroDigitSymbol].setTo(digit); for (int32_t i = kOneDigitSymbol; i <= kNineDigitSymbol; ++i) { digitIndex += U16_LENGTH(digit); digit = digitString.char32At(digitIndex); fSymbols[i].setTo(digit); } } else { nsName = gLatn; } UBool isLatn = !uprv_strcmp(nsName,gLatn); UErrorCode nlStatus = U_ZERO_ERROR; LocalUResourceBundlePointer nonLatnSymbols; if ( !isLatn ) { nonLatnSymbols.adoptInstead( ures_getByKeyWithFallback(numberElementsRes.getAlias(), nsName, NULL, &nlStatus)); ures_getByKeyWithFallback(nonLatnSymbols.getAlias(), gSymbols, nonLatnSymbols.getAlias(), &nlStatus); } LocalUResourceBundlePointer latnSymbols( ures_getByKeyWithFallback(numberElementsRes.getAlias(), gLatn, NULL, &status)); ures_getByKeyWithFallback(latnSymbols.getAlias(), gSymbols, latnSymbols.getAlias(), &status); UBool kMonetaryDecimalSet = FALSE; UBool kMonetaryGroupingSet = FALSE; for(int32_t i = 0; i<kFormatSymbolCount; i++) { if ( gNumberElementKeys[i] != NULL ) { UErrorCode localStatus = U_ZERO_ERROR; if ( !isLatn ) { sym = ures_getStringByKeyWithFallback(nonLatnSymbols.getAlias(), gNumberElementKeys[i], &len, &localStatus); // If we can't find the symbol in the numbering system specific resources, // use the "latn" numbering system as the fallback. 
if ( U_FAILURE(localStatus) ) { localStatus = U_ZERO_ERROR; sym = ures_getStringByKeyWithFallback(latnSymbols.getAlias(), gNumberElementKeys[i], &len, &localStatus); } } else { sym = ures_getStringByKeyWithFallback(latnSymbols.getAlias(), gNumberElementKeys[i], &len, &localStatus); } if ( U_SUCCESS(localStatus) ) { setSymbol((ENumberFormatSymbol)i, UnicodeString(TRUE, sym, len)); if ( i == kMonetarySeparatorSymbol ) { kMonetaryDecimalSet = TRUE; } else if ( i == kMonetaryGroupingSeparatorSymbol ) { kMonetaryGroupingSet = TRUE; } } } } // If monetary decimal or grouping were not explicitly set, then set them to be the // same as their non-monetary counterparts. if ( !kMonetaryDecimalSet ) { setSymbol(kMonetarySeparatorSymbol,fSymbols[kDecimalSeparatorSymbol]); } if ( !kMonetaryGroupingSet ) { setSymbol(kMonetaryGroupingSeparatorSymbol,fSymbols[kGroupingSeparatorSymbol]); } // Obtain currency data from the currency API. This is strictly // for backward compatibility; we don't use DecimalFormatSymbols // for currency data anymore. UErrorCode internalStatus = U_ZERO_ERROR; // don't propagate failures out UChar curriso[4]; UnicodeString tempStr; ucurr_forLocale(locStr, curriso, 4, &internalStatus); uprv_getStaticCurrencyName(curriso, locStr, tempStr, internalStatus); if (U_SUCCESS(internalStatus)) { fSymbols[kIntlCurrencySymbol].setTo(curriso, -1); fSymbols[kCurrencySymbol] = tempStr; } /* else use the default values. 
*/ U_LOCALE_BASED(locBased, *this); locBased.setLocaleIDs(ures_getLocaleByType(numberElementsRes.getAlias(), ULOC_VALID_LOCALE, &status), ures_getLocaleByType(numberElementsRes.getAlias(), ULOC_ACTUAL_LOCALE, &status)); //load the currency data UChar ucc[4]={0}; //Currency Codes are always 3 chars long int32_t uccLen = 4; const char* locName = loc.getName(); UErrorCode localStatus = U_ZERO_ERROR; uccLen = ucurr_forLocale(locName, ucc, uccLen, &localStatus); if(U_SUCCESS(localStatus) && uccLen > 0) { char cc[4]={0}; u_UCharsToChars(ucc, cc, uccLen); /* An explicit currency was requested */ LocalUResourceBundlePointer currencyResource(ures_open(U_ICUDATA_CURR, locStr, &localStatus)); LocalUResourceBundlePointer currency( ures_getByKeyWithFallback(currencyResource.getAlias(), "Currencies", NULL, &localStatus)); ures_getByKeyWithFallback(currency.getAlias(), cc, currency.getAlias(), &localStatus); if(U_SUCCESS(localStatus) && ures_getSize(currency.getAlias())>2) { // the length is 3 if more data is present ures_getByIndex(currency.getAlias(), 2, currency.getAlias(), &localStatus); int32_t currPatternLen = 0; currPattern = ures_getStringByIndex(currency.getAlias(), (int32_t)0, &currPatternLen, &localStatus); UnicodeString decimalSep = ures_getUnicodeStringByIndex(currency.getAlias(), (int32_t)1, &localStatus); UnicodeString groupingSep = ures_getUnicodeStringByIndex(currency.getAlias(), (int32_t)2, &localStatus); if(U_SUCCESS(localStatus)){ fSymbols[kMonetaryGroupingSeparatorSymbol] = groupingSep; fSymbols[kMonetarySeparatorSymbol] = decimalSep; //pattern.setTo(TRUE, currPattern, currPatternLen); status = localStatus; } } /* else An explicit currency was requested and is unknown or locale data is malformed. */ /* ucurr_* API will get the correct value later on. */ } // else ignore the error if no currency // Currency Spacing. 
localStatus = U_ZERO_ERROR; LocalUResourceBundlePointer currencyResource(ures_open(U_ICUDATA_CURR, locStr, &localStatus)); LocalUResourceBundlePointer currencySpcRes( ures_getByKeyWithFallback(currencyResource.getAlias(), gCurrencySpacingTag, NULL, &localStatus)); if (localStatus == U_USING_FALLBACK_WARNING || U_SUCCESS(localStatus)) { const char* keywords[UNUM_CURRENCY_SPACING_COUNT] = { gCurrencyMatchTag, gCurrencySudMatchTag, gCurrencyInsertBtnTag }; localStatus = U_ZERO_ERROR; LocalUResourceBundlePointer dataRes( ures_getByKeyWithFallback(currencySpcRes.getAlias(), gBeforeCurrencyTag, NULL, &localStatus)); if (localStatus == U_USING_FALLBACK_WARNING || U_SUCCESS(localStatus)) { localStatus = U_ZERO_ERROR; for (int32_t i = 0; i < UNUM_CURRENCY_SPACING_COUNT; i++) { currencySpcBeforeSym[i] = ures_getUnicodeStringByKey(dataRes.getAlias(), keywords[i], &localStatus); } } dataRes.adoptInstead( ures_getByKeyWithFallback(currencySpcRes.getAlias(), gAfterCurrencyTag, NULL, &localStatus)); if (localStatus == U_USING_FALLBACK_WARNING || U_SUCCESS(localStatus)) { localStatus = U_ZERO_ERROR; for (int32_t i = 0; i < UNUM_CURRENCY_SPACING_COUNT; i++) { currencySpcAfterSym[i] = ures_getUnicodeStringByKey(dataRes.getAlias(), keywords[i], &localStatus); } } } }
// src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset // if non-null. Size is returned in an out parameter because gtest needs a void // return for ASSERT to work. void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size, size_t* offset) { size_t input_ix = 0; size_t output_ix = 0; bool seen_offset = false; while (src[input_ix] != 0) { switch (src[input_ix]) { case '\'': // single ASCII char LOG_ALWAYS_FATAL_IF(static_cast<uint8_t>(src[input_ix]) >= 0x80); input_ix++; LOG_ALWAYS_FATAL_IF(src[input_ix] == 0); LOG_ALWAYS_FATAL_IF(output_ix >= buf_size); buf[output_ix++] = (uint16_t)src[input_ix++]; LOG_ALWAYS_FATAL_IF(src[input_ix] != '\''); input_ix++; break; case 'u': case 'U': { // Unicode codepoint in hex syntax input_ix++; LOG_ALWAYS_FATAL_IF(src[input_ix] != '+'); input_ix++; char* endptr = (char*)src + input_ix; unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16); size_t num_hex_digits = endptr - (src + input_ix); // also triggers on invalid number syntax, digits = 0 LOG_ALWAYS_FATAL_IF(num_hex_digits < 4u); LOG_ALWAYS_FATAL_IF(num_hex_digits > 6u); LOG_ALWAYS_FATAL_IF(codepoint > 0x10FFFFu); input_ix += num_hex_digits; if (U16_LENGTH(codepoint) == 1) { LOG_ALWAYS_FATAL_IF(output_ix + 1 > buf_size); buf[output_ix++] = codepoint; } else { // UTF-16 encoding LOG_ALWAYS_FATAL_IF(output_ix + 2 > buf_size); buf[output_ix++] = U16_LEAD(codepoint); buf[output_ix++] = U16_TRAIL(codepoint); } break; } case ' ': input_ix++; break; case '|': LOG_ALWAYS_FATAL_IF(seen_offset); LOG_ALWAYS_FATAL_IF(offset == nullptr); *offset = output_ix; seen_offset = true; input_ix++; break; default: LOG_ALWAYS_FATAL("Unexpected Character"); } } LOG_ALWAYS_FATAL_IF(result_size == nullptr); *result_size = output_ix; LOG_ALWAYS_FATAL_IF(!seen_offset && offset != nullptr); }
static uint32_t getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { uint8_t *bytes, *resultBytes; uint32_t value; int32_t u16Length, ratio; if(m->f==2) { /* * no mapping, <subchar1> preferred * * no need to count in statistics because the subchars are already * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData, * and this non-mapping does not count for maxInUChars which are always * trivially at least two if counting unmappable supplementary code points */ return UCNV_EXT_FROM_U_SUBCHAR1; } bytes=UCM_GET_BYTES(table, m); value=0; switch(m->bLen) { /* 1..3: store the bytes in the value word */ case 3: value=((uint32_t)*bytes++)<<16; case 2: value|=((uint32_t)*bytes++)<<8; case 1: value|=*bytes; break; default: /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */ /* store the bytes in fromUBytes[] and the index in the value word */ value=(uint32_t)utm_countItems(extData->fromUBytes); resultBytes=utm_allocN(extData->fromUBytes, m->bLen); uprv_memcpy(resultBytes, bytes, m->bLen); break; } value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT; if(m->f==0) { value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG; } /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */ if(m->uLen==1) { u16Length=U16_LENGTH(m->u); } else { u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2); } /* update statistics */ if(u16Length>extData->maxInUChars) { extData->maxInUChars=u16Length; } if(m->bLen>extData->maxOutBytes) { extData->maxOutBytes=m->bLen; } ratio=(m->bLen+(u16Length-1))/u16Length; if(ratio>extData->maxBytesPerUChar) { extData->maxBytesPerUChar=ratio; } return value; }
static int32_t unorm_iterate(UCharIterator *src, UBool forward, UChar *dest, int32_t destCapacity, UNormalizationMode mode, int32_t options, UBool doNormalize, UBool *pNeededToNormalize, UErrorCode *pErrorCode) { const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); const UnicodeSet *uni32; if(options&UNORM_UNICODE_3_2) { uni32=uniset_getUnicode32Instance(*pErrorCode); } else { uni32=NULL; // unused } if(U_FAILURE(*pErrorCode)) { return 0; } FilteredNormalizer2 fn2(*n2, *uni32); if(options&UNORM_UNICODE_3_2) { n2=&fn2; } if( destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL ) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } if(pNeededToNormalize!=NULL) { *pNeededToNormalize=FALSE; } if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { return u_terminateUChars(dest, destCapacity, 0, pErrorCode); } UnicodeString buffer; UChar32 c; if(forward) { /* get one character and ignore its properties */ buffer.append(uiter_next32(src)); /* get all following characters until we see a boundary */ while((c=uiter_next32(src))>=0) { if(n2->hasBoundaryBefore(c)) { /* back out the latest movement to stop at the boundary */ src->move(src, -U16_LENGTH(c), UITER_CURRENT); break; } else { buffer.append(c); } } } else { while((c=uiter_previous32(src))>=0) { /* always write this character to the front of the buffer */ buffer.insert(0, c); /* stop if this just-copied character is a boundary */ if(n2->hasBoundaryBefore(c)) { break; } } } UnicodeString destString(dest, 0, destCapacity); if(buffer.length()>0 && doNormalize) { n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { *pNeededToNormalize= destString!=buffer; } return destString.length(); } else { /* just copy the source characters */ return buffer.extract(dest, destCapacity, *pErrorCode); } }
static inline int32_t posAfter(const Replaceable& str, int32_t pos) { return (pos >= 0 && pos < str.length()) ? pos + U16_LENGTH(str.char32At(pos)) : pos + 1; }
/** * Implements {@link Transliterator#handleTransliterate}. */ void TitlecaseTransliterator::handleTransliterate( Replaceable& text, UTransPosition& offsets, UBool isIncremental) const { // TODO reimplement, see ustrcase.c // using a real word break iterator // instead of just looking for a transition between cased and uncased characters // call CaseMapTransliterator::handleTransliterate() for lowercasing? (set fMap) // needs to take isIncremental into account because case mappings are context-sensitive // also detect when lowercasing function did not finish because of context if (offsets.start >= offsets.limit) { return; } // case type: >0 cased (UCASE_LOWER etc.) ==0 uncased <0 case-ignorable int32_t type; // Our mode; we are either converting letter toTitle or // toLower. UBool doTitle = TRUE; // Determine if there is a preceding context of cased case-ignorable*, // in which case we want to start in toLower mode. If the // prior context is anything else (including empty) then start // in toTitle mode. UChar32 c; int32_t start; for (start = offsets.start - 1; start >= offsets.contextStart; start -= U16_LENGTH(c)) { c = text.char32At(start); type=ucase_getTypeOrIgnorable(fCsp, c); if(type>0) { // cased doTitle=FALSE; break; } else if(type==0) { // uncased but not ignorable break; } // else (type<0) case-ignorable: continue } // Convert things after a cased character toLower; things // after an uncased, non-case-ignorable character toTitle. Case-ignorable // characters are copied directly and do not change the mode. 
UCaseContext csc; uprv_memset(&csc, 0, sizeof(csc)); csc.p = &text; csc.start = offsets.contextStart; csc.limit = offsets.contextLimit; UnicodeString tmp; const UChar *s; int32_t textPos, delta, result, locCache=0; for(textPos=offsets.start; textPos<offsets.limit;) { csc.cpStart=textPos; c=text.char32At(textPos); csc.cpLimit=textPos+=U16_LENGTH(c); type=ucase_getTypeOrIgnorable(fCsp, c); if(type>=0) { // not case-ignorable if(doTitle) { result=ucase_toFullTitle(fCsp, c, utrans_rep_caseContextIterator, &csc, &s, "", &locCache); } else { result=ucase_toFullLower(fCsp, c, utrans_rep_caseContextIterator, &csc, &s, "", &locCache); } doTitle = (UBool)(type==0); // doTitle=isUncased if(csc.b1 && isIncremental) { // fMap() tried to look beyond the context limit // wait for more input offsets.start=csc.cpStart; return; } if(result>=0) { // replace the current code point with its full case mapping result // see UCASE_MAX_STRING_LENGTH if(result<=UCASE_MAX_STRING_LENGTH) { // string s[result] tmp.setTo(FALSE, s, result); delta=result-U16_LENGTH(c); } else { // single code point tmp.setTo(result); delta=tmp.length()-U16_LENGTH(c); } text.handleReplaceBetween(csc.cpStart, textPos, tmp); if(delta!=0) { textPos+=delta; csc.limit=offsets.contextLimit+=delta; offsets.limit+=delta; } } } } offsets.start=textPos; }
extern void storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UStringPrepType type, UErrorCode* status){ UChar* map = NULL; int16_t adjustedLen=0, i, j; uint16_t trieWord = 0; ValueStruct *value = NULL; uint32_t savedTrieWord = 0; /* initialize the hashtable */ if(hashTable==NULL){ hashTable = uhash_open(hashEntry, compareEntries, NULL, status); uhash_setValueDeleter(hashTable, valueDeleter); } /* figure out if the code point has type already stored */ savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); if(savedTrieWord!=0){ if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ /* turn on the first bit in trie word */ trieWord += 0x01; }else{ /* * the codepoint has value something other than prohibited * and a mapping .. error! */ fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); exit(U_ILLEGAL_ARGUMENT_ERROR); } } /* figure out the real length */ for(i=0; i<length; i++){ adjustedLen += U16_LENGTH(mapping[i]); } if(adjustedLen == 0){ trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); /* make sure that the value of trieWord is less than the threshold */ if(trieWord < _SPREP_TYPE_THRESHOLD){ /* now set the value in the trie */ if(!utrie_set32(sprepTrie,codepoint,trieWord)){ fprintf(stderr,"Could not set the value for code point.\n"); exit(U_ILLEGAL_ARGUMENT_ERROR); } /* value is set so just return */ return; }else{ fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); exit(U_ILLEGAL_CHAR_FOUND); } } if(adjustedLen == 1){ /* calculate the delta */ int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ trieWord = delta << 2; /* make sure that the second bit is OFF */ if((trieWord & 0x02) != 0 ){ fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); exit(U_INTERNAL_PROGRAM_ERROR); } /* make sure that the value of trieWord is 
less than the threshold */ if(trieWord < _SPREP_TYPE_THRESHOLD){ /* now set the value in the trie */ if(!utrie_set32(sprepTrie,codepoint,trieWord)){ fprintf(stderr,"Could not set the value for code point.\n"); exit(U_ILLEGAL_ARGUMENT_ERROR); } /* value is set so just return */ return; } } /* * if the delta is not in the given range or if the trieWord is larger than the threshold * just fall through for storing the mapping in the mapping table */ } map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR); for (i=0, j=0; i<length; i++) { U16_APPEND_UNSAFE(map, j, mapping[i]); } value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct)); value->mapping = map; value->type = type; value->length = adjustedLen; if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ mappingDataCapacity++; } if(maxLength < value->length){ maxLength = value->length; } uhash_iput(hashTable,codepoint,value,status); mappingDataCapacity += adjustedLen; if(U_FAILURE(*status)){ fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status)); exit(*status); } }
/*
 * Parses a decimal-format pattern string into `out` without expanding the
 * affixes.
 *
 * The pattern consists of up to two parts separated by fSeparator: part 0 is
 * the positive pattern, part 1 (optional) the negative one. Each part is
 * scanned with a small state machine over `subpart`:
 *   1 = prefix, 0 = pattern proper (digits, grouping, decimal, exponent),
 *   2 = suffix, 3 = quoted text inside the prefix, 4 = quoted text inside
 *   the suffix.
 * All numeric settings are taken from part 0; from part 1 only the prefix and
 * suffix are kept. Values are accumulated in locals and written to `out`
 * only after a part has passed the syntax checks.
 *
 * After the loop:
 *  - an empty pattern installs default settings;
 *  - if there is no negative pattern, or it equals the positive one, the
 *    negative prefix becomes a quoted pattern minus sign followed by the
 *    positive prefix, and the negative suffix is the positive suffix;
 *  - the four affix pattern strings are parsed into their AffixPattern
 *    counterparts.
 *
 * @param pattern    the pattern to parse
 * @param out        receives the parsed pattern; reset to a
 *                   default-constructed DecimalFormatPattern before parsing
 * @param parseError receives the error location via syntaxError() when a
 *                   syntax problem is found; offset is preset to -1
 * @param status     in/out ICU error code; the function does nothing if it
 *                   already indicates failure, and returns immediately after
 *                   setting it when the pattern is malformed
 *
 * NOTE(review): nineDigit is computed as a single UChar (fZeroDigit + 9); the
 * in-code TODO already says this will not always work.
 * NOTE(review): the pad checks `(pos+1) == pattern.length()` and
 * `padPos+2 == ...` only match when the pad escape and the pad character are
 * one code unit each, although pos is advanced by fPadEscape.length() and
 * U16_LENGTH(padChar) - confirm whether longer values need support.
 */
void DecimalFormatPatternParser::applyPatternWithoutExpandAffix( const UnicodeString& pattern, DecimalFormatPattern& out, UParseError& parseError, UErrorCode& status) { if (U_FAILURE(status)) { return; } out = DecimalFormatPattern(); // Clear error struct parseError.offset = -1; parseError.preContext[0] = parseError.postContext[0] = (UChar)0; // TODO: Travis Keep: This won't always work. UChar nineDigit = (UChar)(fZeroDigit + 9); int32_t digitLen = fDigit.length(); int32_t groupSepLen = fGroupingSeparator.length(); int32_t decimalSepLen = fDecimalSeparator.length(); int32_t pos = 0; int32_t patLen = pattern.length(); // Part 0 is the positive pattern. Part 1, if present, is the negative // pattern. for (int32_t part=0; part<2 && pos<patLen; ++part) { // The subpart ranges from 0 to 4: 0=pattern proper, 1=prefix, // 2=suffix, 3=prefix in quote, 4=suffix in quote. Subpart 0 is // between the prefix and suffix, and consists of pattern // characters. In the prefix and suffix, percent, perMill, and // currency symbols are recognized and translated. int32_t subpart = 1, sub0Start = 0, sub0Limit = 0, sub2Limit = 0; // It's important that we don't change any fields of this object // prematurely. We set the following variables for the multiplier, // grouping, etc., and then only change the actual object fields if // everything parses correctly. This also lets us register // the data from part 0 and ignore the part 1, except for the // prefix and suffix. UnicodeString prefix; UnicodeString suffix; int32_t decimalPos = -1; int32_t multiplier = 1; int32_t digitLeftCount = 0, zeroDigitCount = 0, digitRightCount = 0, sigDigitCount = 0; int8_t groupingCount = -1; int8_t groupingCount2 = -1; int32_t padPos = -1; UChar32 padChar = 0; int32_t roundingPos = -1; DigitList roundingInc; int8_t expDigits = -1; UBool expSignAlways = FALSE; // The affix is either the prefix or the suffix. 
UnicodeString* affix = &prefix; int32_t start = pos; UBool isPartDone = FALSE; UChar32 ch; for (; !isPartDone && pos < patLen; ) { // Todo: account for surrogate pairs ch = pattern.char32At(pos); switch (subpart) { case 0: // Pattern proper subpart (between prefix & suffix) // Process the digits, decimal, and grouping characters. We // record five pieces of information. We expect the digits // to occur in the pattern ####00.00####, and we record the // number of left digits, zero (central) digits, and right // digits. The position of the last grouping character is // recorded (should be somewhere within the first two blocks // of characters), as is the position of the decimal point, // if any (should be in the zero digits). If there is no // decimal point, then there should be no right digits. if (pattern.compare(pos, digitLen, fDigit) == 0) { if (zeroDigitCount > 0 || sigDigitCount > 0) { ++digitRightCount; } else { ++digitLeftCount; } if (groupingCount >= 0 && decimalPos < 0) { ++groupingCount; } pos += digitLen; } else if ((ch >= fZeroDigit && ch <= nineDigit) || ch == fSigDigit) { if (digitRightCount > 0) { // Unexpected '0' debug("Unexpected '0'") status = U_UNEXPECTED_TOKEN; syntaxError(pattern,pos,parseError); return; } if (ch == fSigDigit) { ++sigDigitCount; } else { if (ch != fZeroDigit && roundingPos < 0) { roundingPos = digitLeftCount + zeroDigitCount; } if (roundingPos >= 0) { roundingInc.append((char)(ch - fZeroDigit + '0')); } ++zeroDigitCount; } if (groupingCount >= 0 && decimalPos < 0) { ++groupingCount; } pos += U16_LENGTH(ch); } else if (pattern.compare(pos, groupSepLen, fGroupingSeparator) == 0) { if (decimalPos >= 0) { // Grouping separator after decimal debug("Grouping separator after decimal") status = U_UNEXPECTED_TOKEN; syntaxError(pattern,pos,parseError); return; } groupingCount2 = groupingCount; groupingCount = 0; pos += groupSepLen; } else if (pattern.compare(pos, decimalSepLen, fDecimalSeparator) == 0) { if (decimalPos >= 0) { // 
Multiple decimal separators debug("Multiple decimal separators") status = U_MULTIPLE_DECIMAL_SEPARATORS; syntaxError(pattern,pos,parseError); return; } // Intentionally incorporate the digitRightCount, // even though it is illegal for this to be > 0 // at this point. We check pattern syntax below. decimalPos = digitLeftCount + zeroDigitCount + digitRightCount; pos += decimalSepLen; } else { if (pattern.compare(pos, fExponent.length(), fExponent) == 0) { if (expDigits >= 0) { // Multiple exponential symbols debug("Multiple exponential symbols") status = U_MULTIPLE_EXPONENTIAL_SYMBOLS; syntaxError(pattern,pos,parseError); return; } if (groupingCount >= 0) { // Grouping separator in exponential pattern debug("Grouping separator in exponential pattern") status = U_MALFORMED_EXPONENTIAL_PATTERN; syntaxError(pattern,pos,parseError); return; } pos += fExponent.length(); // Check for positive prefix if (pos < patLen && pattern.compare(pos, fPlus.length(), fPlus) == 0) { expSignAlways = TRUE; pos += fPlus.length(); } // Use lookahead to parse out the exponential part of the // pattern, then jump into suffix subpart. expDigits = 0; while (pos < patLen && pattern.char32At(pos) == fZeroDigit) { ++expDigits; pos += U16_LENGTH(fZeroDigit); } // 1. Require at least one mantissa pattern digit // 2. Disallow "#+ @" in mantissa // 3. Require at least one exponent pattern digit if (((digitLeftCount + zeroDigitCount) < 1 && (sigDigitCount + digitRightCount) < 1) || (sigDigitCount > 0 && digitLeftCount > 0) || expDigits < 1) { // Malformed exponential pattern debug("Malformed exponential pattern") status = U_MALFORMED_EXPONENTIAL_PATTERN; syntaxError(pattern,pos,parseError); return; } } // Transition to suffix subpart subpart = 2; // suffix subpart affix = &suffix; sub0Limit = pos; continue; } break; case 1: // Prefix subpart case 2: // Suffix subpart // Process the prefix / suffix characters // Process unquoted characters seen in prefix or suffix // subpart. 
// Several syntax characters implicitly begins the // next subpart if we are in the prefix; otherwise // they are illegal if unquoted. if (!pattern.compare(pos, digitLen, fDigit) || !pattern.compare(pos, groupSepLen, fGroupingSeparator) || !pattern.compare(pos, decimalSepLen, fDecimalSeparator) || (ch >= fZeroDigit && ch <= nineDigit) || ch == fSigDigit) { if (subpart == 1) { // prefix subpart subpart = 0; // pattern proper subpart sub0Start = pos; // Reprocess this character continue; } else { status = U_UNQUOTED_SPECIAL; syntaxError(pattern,pos,parseError); return; } } else if (ch == kCurrencySign) { affix->append(kQuote); // Encode currency // Use lookahead to determine if the currency sign is // doubled or not. U_ASSERT(U16_LENGTH(kCurrencySign) == 1); if ((pos+1) < pattern.length() && pattern[pos+1] == kCurrencySign) { affix->append(kCurrencySign); ++pos; // Skip over the doubled character if ((pos+1) < pattern.length() && pattern[pos+1] == kCurrencySign) { affix->append(kCurrencySign); ++pos; // Skip over the doubled character out.fCurrencySignCount = fgCurrencySignCountInPluralFormat; } else { out.fCurrencySignCount = fgCurrencySignCountInISOFormat; } } else { out.fCurrencySignCount = fgCurrencySignCountInSymbolFormat; } // Fall through to append(ch) } else if (ch == kQuote) { // A quote outside quotes indicates either the opening // quote or two quotes, which is a quote literal. That is, // we have the first quote in 'do' or o''clock. U_ASSERT(U16_LENGTH(kQuote) == 1); ++pos; if (pos < pattern.length() && pattern[pos] == kQuote) { affix->append(kQuote); // Encode quote // Fall through to append(ch) } else { subpart += 2; // open quote continue; } } else if (pattern.compare(pos, fSeparator.length(), fSeparator) == 0) { // Don't allow separators in the prefix, and don't allow // separators in the second pattern (part == 1). 
if (subpart == 1 || part == 1) { // Unexpected separator debug("Unexpected separator") status = U_UNEXPECTED_TOKEN; syntaxError(pattern,pos,parseError); return; } sub2Limit = pos; isPartDone = TRUE; // Go to next part pos += fSeparator.length(); break; } else if (pattern.compare(pos, fPercent.length(), fPercent) == 0) { // Next handle characters which are appended directly. if (multiplier != 1) { // Too many percent/perMill characters debug("Too many percent characters") status = U_MULTIPLE_PERCENT_SYMBOLS; syntaxError(pattern,pos,parseError); return; } affix->append(kQuote); // Encode percent/perMill affix->append(kPatternPercent); // Use unlocalized pattern char multiplier = 100; pos += fPercent.length(); break; } else if (pattern.compare(pos, fPerMill.length(), fPerMill) == 0) { // Next handle characters which are appended directly. if (multiplier != 1) { // Too many percent/perMill characters debug("Too many perMill characters") status = U_MULTIPLE_PERMILL_SYMBOLS; syntaxError(pattern,pos,parseError); return; } affix->append(kQuote); // Encode percent/perMill affix->append(kPatternPerMill); // Use unlocalized pattern char multiplier = 1000; pos += fPerMill.length(); break; } else if (pattern.compare(pos, fPadEscape.length(), fPadEscape) == 0) { if (padPos >= 0 || // Multiple pad specifiers (pos+1) == pattern.length()) { // Nothing after padEscape debug("Multiple pad specifiers") status = U_MULTIPLE_PAD_SPECIFIERS; syntaxError(pattern,pos,parseError); return; } padPos = pos; pos += fPadEscape.length(); padChar = pattern.char32At(pos); pos += U16_LENGTH(padChar); break; } else if (pattern.compare(pos, fMinus.length(), fMinus) == 0) { affix->append(kQuote); // Encode minus affix->append(kPatternMinus); pos += fMinus.length(); break; } else if (pattern.compare(pos, fPlus.length(), fPlus) == 0) { affix->append(kQuote); // Encode plus affix->append(kPatternPlus); pos += fPlus.length(); break; } // Unquoted, non-special characters fall through to here, as // well as 
other code which needs to append something to the // affix. affix->append(ch); pos += U16_LENGTH(ch); break; case 3: // Prefix subpart, in quote case 4: // Suffix subpart, in quote // A quote within quotes indicates either the closing // quote or two quotes, which is a quote literal. That is, // we have the second quote in 'do' or 'don''t'. if (ch == kQuote) { ++pos; if (pos < pattern.length() && pattern[pos] == kQuote) { affix->append(kQuote); // Encode quote // Fall through to append(ch) } else { subpart -= 2; // close quote continue; } } affix->append(ch); pos += U16_LENGTH(ch); break; } } if (sub0Limit == 0) { sub0Limit = pattern.length(); } if (sub2Limit == 0) { sub2Limit = pattern.length(); } /* Handle patterns with no '0' pattern character. These patterns * are legal, but must be recodified to make sense. "##.###" -> * "#0.###". ".###" -> ".0##". * * We allow patterns of the form "####" to produce a zeroDigitCount * of zero (got that?); although this seems like it might make it * possible for format() to produce empty strings, format() checks * for this condition and outputs a zero digit in this situation. * Having a zeroDigitCount of zero yields a minimum integer digits * of zero, which allows proper round-trip patterns. We don't want * "#" to become "#0" when toPattern() is called (even though that's * what it really is, semantically). */ if (zeroDigitCount == 0 && sigDigitCount == 0 && digitLeftCount > 0 && decimalPos >= 0) { // Handle "###.###" and "###." and ".###" int n = decimalPos; if (n == 0) ++n; // Handle ".###" digitRightCount = digitLeftCount - n; digitLeftCount = n - 1; zeroDigitCount = 1; } // Do syntax checking on the digits, decimal points, and quotes. 
if ((decimalPos < 0 && digitRightCount > 0 && sigDigitCount == 0) || (decimalPos >= 0 && (sigDigitCount > 0 || decimalPos < digitLeftCount || decimalPos > (digitLeftCount + zeroDigitCount))) || groupingCount == 0 || groupingCount2 == 0 || (sigDigitCount > 0 && zeroDigitCount > 0) || subpart > 2) { // subpart > 2 == unmatched quote debug("Syntax error") status = U_PATTERN_SYNTAX_ERROR; syntaxError(pattern,pos,parseError); return; } // Make sure pad is at legal position before or after affix. if (padPos >= 0) { if (padPos == start) { padPos = DecimalFormatPattern::kPadBeforePrefix; } else if (padPos+2 == sub0Start) { padPos = DecimalFormatPattern::kPadAfterPrefix; } else if (padPos == sub0Limit) { padPos = DecimalFormatPattern::kPadBeforeSuffix; } else if (padPos+2 == sub2Limit) { padPos = DecimalFormatPattern::kPadAfterSuffix; } else { // Illegal pad position debug("Illegal pad position") status = U_ILLEGAL_PAD_POSITION; syntaxError(pattern,pos,parseError); return; } } if (part == 0) { out.fPosPatternsBogus = FALSE; out.fPosPrefixPattern = prefix; out.fPosSuffixPattern = suffix; out.fNegPatternsBogus = TRUE; out.fNegPrefixPattern.remove(); out.fNegSuffixPattern.remove(); out.fUseExponentialNotation = (expDigits >= 0); if (out.fUseExponentialNotation) { out.fMinExponentDigits = expDigits; } out.fExponentSignAlwaysShown = expSignAlways; int32_t digitTotalCount = digitLeftCount + zeroDigitCount + digitRightCount; // The effectiveDecimalPos is the position the decimal is at or // would be at if there is no decimal. Note that if // decimalPos<0, then digitTotalCount == digitLeftCount + // zeroDigitCount. int32_t effectiveDecimalPos = decimalPos >= 0 ? 
decimalPos : digitTotalCount; UBool isSigDig = (sigDigitCount > 0); out.fUseSignificantDigits = isSigDig; if (isSigDig) { out.fMinimumSignificantDigits = sigDigitCount; out.fMaximumSignificantDigits = sigDigitCount + digitRightCount; } else { int32_t minInt = effectiveDecimalPos - digitLeftCount; out.fMinimumIntegerDigits = minInt; out.fMaximumIntegerDigits = out.fUseExponentialNotation ? digitLeftCount + out.fMinimumIntegerDigits : gDefaultMaxIntegerDigits; out.fMaximumFractionDigits = decimalPos >= 0 ? (digitTotalCount - decimalPos) : 0; out.fMinimumFractionDigits = decimalPos >= 0 ? (digitLeftCount + zeroDigitCount - decimalPos) : 0; } out.fGroupingUsed = groupingCount > 0; out.fGroupingSize = (groupingCount > 0) ? groupingCount : 0; out.fGroupingSize2 = (groupingCount2 > 0 && groupingCount2 != groupingCount) ? groupingCount2 : 0; out.fMultiplier = multiplier; out.fDecimalSeparatorAlwaysShown = decimalPos == 0 || decimalPos == digitTotalCount; if (padPos >= 0) { out.fPadPosition = (DecimalFormatPattern::EPadPosition) padPos; // To compute the format width, first set up sub0Limit - // sub0Start. Add in prefix/suffix length later. 
// fFormatWidth = prefix.length() + suffix.length() + // sub0Limit - sub0Start; out.fFormatWidth = sub0Limit - sub0Start; out.fPad = padChar; } else { out.fFormatWidth = 0; } if (roundingPos >= 0) { out.fRoundingIncrementUsed = TRUE; roundingInc.setDecimalAt(effectiveDecimalPos - roundingPos); out.fRoundingIncrement = roundingInc; } else { out.fRoundingIncrementUsed = FALSE; } } else { out.fNegPatternsBogus = FALSE; out.fNegPrefixPattern = prefix; out.fNegSuffixPattern = suffix; } } if (pattern.length() == 0) { out.fNegPatternsBogus = TRUE; out.fNegPrefixPattern.remove(); out.fNegSuffixPattern.remove(); out.fPosPatternsBogus = FALSE; out.fPosPrefixPattern.remove(); out.fPosSuffixPattern.remove(); out.fMinimumIntegerDigits = 0; out.fMaximumIntegerDigits = kDoubleIntegerDigits; out.fMinimumFractionDigits = 0; out.fMaximumFractionDigits = kDoubleFractionDigits; out.fUseExponentialNotation = FALSE; out.fCurrencySignCount = fgCurrencySignCountZero; out.fGroupingUsed = FALSE; out.fGroupingSize = 0; out.fGroupingSize2 = 0; out.fMultiplier = 1; out.fDecimalSeparatorAlwaysShown = FALSE; out.fFormatWidth = 0; out.fRoundingIncrementUsed = FALSE; } // If there was no negative pattern, or if the negative pattern is // identical to the positive pattern, then prepend the minus sign to the // positive pattern to form the negative pattern. if (out.fNegPatternsBogus || (out.fNegPrefixPattern == out.fPosPrefixPattern && out.fNegSuffixPattern == out.fPosSuffixPattern)) { out.fNegPatternsBogus = FALSE; out.fNegSuffixPattern = out.fPosSuffixPattern; out.fNegPrefixPattern.remove(); out.fNegPrefixPattern.append(kQuote).append(kPatternMinus) .append(out.fPosPrefixPattern); } // TODO: Deprecate/Remove out.fNegSuffixPattern and 3 other fields. 
AffixPattern::parseAffixString( out.fNegSuffixPattern, out.fNegSuffixAffix, status); AffixPattern::parseAffixString( out.fPosSuffixPattern, out.fPosSuffixAffix, status); AffixPattern::parseAffixString( out.fNegPrefixPattern, out.fNegPrefixAffix, status); AffixPattern::parseAffixString( out.fPosPrefixPattern, out.fPosPrefixAffix, status); }
/* * Match each code point in a string against each code point in the matchSet. * Return the index of the first string code point that * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. * Return -(string length)-1 if there is no such code point. */ static int32_t _matchFromSet(const UChar* string, const UChar* matchSet, UBool polarity) { int32_t matchLen, matchBMPLen, strItr, matchItr; UChar32 stringCh, matchCh; UChar c, c2; /* first part of matchSet contains only BMP code points */ matchBMPLen = 0; while ((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { ++matchBMPLen; } /* second part of matchSet contains BMP and supplementary code points */ matchLen = matchBMPLen; while (matchSet[matchLen] != 0) { ++matchLen; } for (strItr = 0; (c = string[strItr]) != 0;) { ++strItr; if (U16_IS_SINGLE(c)) { if (polarity) { for (matchItr = 0; matchItr < matchLen; ++matchItr) { if (c == matchSet[matchItr]) { return strItr - 1; /* one matches */ } } } else { for (matchItr = 0; matchItr < matchLen; ++matchItr) { if (c == matchSet[matchItr]) { goto endloop; } } return strItr - 1; /* none matches */ } } else { /* * No need to check for string length before U16_IS_TRAIL * because c2 could at worst be the terminating NUL. */ if (U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { ++strItr; stringCh = U16_GET_SUPPLEMENTARY(c, c2); } else { stringCh = c; /* unpaired trail surrogate */ } if (polarity) { for (matchItr = matchBMPLen; matchItr < matchLen;) { U16_NEXT(matchSet, matchItr, matchLen, matchCh); if (stringCh == matchCh) { return strItr - U16_LENGTH(stringCh); /* one matches */ } } } else { for (matchItr = matchBMPLen; matchItr < matchLen;) { U16_NEXT(matchSet, matchItr, matchLen, matchCh); if (stringCh == matchCh) { goto endloop; } } return strItr - U16_LENGTH(stringCh); /* none matches */ } } endloop: /* wish C had continue with labels like Java... */; } /* Didn't find it. */ return -strItr - 1; }
/**
 * Store a copy of the identifier and recompute the derived information:
 * the set of representative zero digits (fNumerics), the scripts that are
 * definitely required (fRequiredScripts), the alternative script sets
 * (fScriptSetSet) and what those alternatives have in common
 * (fCommonAmongAlternates).
 *
 * @param identifier the identifier to analyze
 * @param status     ICU error code; nothing is done if it already indicates
 *                   failure, and the function returns early if the script
 *                   extension lookup fails
 * @return *this
 */
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    *fIdentifier = identifier;
    clear();

    // First pass: classify every code point of the identifier.
    ScriptSet cpScripts;
    int32_t offset = 0;
    while (offset < identifier.length()) {
        const UChar32 codePoint = identifier.char32At(offset);
        offset += U16_LENGTH(codePoint);

        // Remember one representative per decimal digit system: its zero digit.
        // Unicode guarantees the zero is at (code point - numeric value).
        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
            const UChar32 zeroDigit = codePoint - (UChar32)u_getNumericValue(codePoint);
            fNumerics->add(zeroDigit);
        }

        UScriptCode scriptExt[500];
        const int32_t scriptExtCount =
            uscript_getScriptExtensions(codePoint, scriptExt, UPRV_LENGTHOF(scriptExt), &status);
        if (U_FAILURE(status)) {
            return *this;
        }

        // Build the script set of this code point, ignoring Common and Inherited.
        cpScripts.resetAll();
        for (int32_t k = 0; k < scriptExtCount; ++k) {
            cpScripts.set(scriptExt[k], status);
        }
        cpScripts.reset(USCRIPT_COMMON, status);
        cpScripts.reset(USCRIPT_INHERITED, status);

        const int32_t memberCount = cpScripts.countMembers();
        if (memberCount == 1) {
            // Exactly one script: it is required.
            fRequiredScripts->Union(cpScripts);
        } else if (memberCount != 0) {
            // Several candidate scripts: keep the set as an alternative unless
            // a required script already covers it or it was recorded before.
            const bool alreadyHandled =
                fRequiredScripts->intersects(cpScripts) || uhash_geti(fScriptSetSet, &cpScripts);
            if (!alreadyHandled) {
                // A copy is stored; fScriptSetSet takes ownership of the copy.
                uhash_puti(fScriptSetSet, new ScriptSet(cpScripts), 1, &status);
            }
        }
    }

    // Second pass over the alternatives: drop those made redundant by scripts
    // that became required later, e.g. [Kana], [Kana Hira] => [Kana].
    // This is relatively infrequent, so it does not have to be optimized.
    // The commonalities among the alternates are computed along the way.
    if (uhash_count(fScriptSetSet) > 0) {
        fCommonAmongAlternates->setAll();

        int32_t outerPos = UHASH_FIRST;
        const UHashElement *outerEl;
        while ((outerEl = uhash_nextElement(fScriptSetSet, &outerPos)) != NULL) {
            ScriptSet *candidate = static_cast<ScriptSet *>(outerEl->key.pointer);

            // [Kana], [Kana Hira] => [Kana]
            if (fRequiredScripts->intersects(*candidate)) {
                uhash_removeElement(fScriptSetSet, outerEl);
                continue;
            }

            fCommonAmongAlternates->intersect(*candidate);

            // Drop a set that contains another recorded alternative:
            // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
            int32_t innerPos = UHASH_FIRST;
            const UHashElement *innerEl;
            while ((innerEl = uhash_nextElement(fScriptSetSet, &innerPos)) != NULL) {
                ScriptSet *rival = static_cast<ScriptSet *>(innerEl->key.pointer);
                if (candidate != rival && candidate->contains(*rival)) {
                    uhash_removeElement(fScriptSetSet, outerEl);
                    break;
                }
            }
        }
    }

    // With no alternatives left there is nothing in common among them.
    if (uhash_count(fScriptSetSet) == 0) {
        fCommonAmongAlternates->resetAll();
    }
    return *this;
}
/** * Implements {@link Transliterator#handleTransliterate}. */ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental) const { // The failure mode, here and below, is to behave like Any-Null, // if either there is no name data (max len == 0) or there is no // memory (malloc() => NULL). int32_t maxLen = uprv_getMaxCharNameLength(); if (maxLen == 0) { offsets.start = offsets.limit; return; } // Accomodate the longest possible name ++maxLen; // allow for temporary trailing space char* cbuf = (char*) uprv_malloc(maxLen); if (cbuf == NULL) { offsets.start = offsets.limit; return; } UnicodeString openPat(TRUE, OPEN, -1); UnicodeString str, name; int32_t cursor = offsets.start; int32_t limit = offsets.limit; // Modes: // 0 - looking for open delimiter // 1 - after open delimiter int32_t mode = 0; int32_t openPos = -1; // open delim candidate pos UChar32 c; while (cursor < limit) { c = text.char32At(cursor); switch (mode) { case 0: // looking for open delimiter if (c == OPEN_DELIM) { // quick check first openPos = cursor; int32_t i = ICU_Utility::parsePattern(openPat, text, cursor, limit); if (i >= 0 && i < limit) { mode = 1; name.truncate(0); cursor = i; continue; // *** reprocess char32At(cursor) } } break; case 1: // after open delimiter // Look for legal chars. If \s+ is found, convert it // to a single space. If closeDelimiter is found, exit // the loop. If any other character is found, exit the // loop. If the limit is reached, exit the loop. // Convert \s+ => SPACE. This assumes there are no // runs of >1 space characters in names. if (PatternProps::isWhiteSpace(c)) { // Ignore leading whitespace if (name.length() > 0 && name.charAt(name.length()-1) != SPACE) { name.append(SPACE); // If we are too long then abort. maxLen includes // temporary trailing space, so use '>'. 
if (name.length() > maxLen) { mode = 0; } } break; } if (c == CLOSE_DELIM) { int32_t len = name.length(); // Delete trailing space, if any if (len > 0 && name.charAt(len-1) == SPACE) { --len; } if (uprv_isInvariantUString(name.getBuffer(), len)) { name.extract(0, len, cbuf, maxLen, US_INV); UErrorCode status = U_ZERO_ERROR; c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status); if (U_SUCCESS(status)) { // Lookup succeeded // assert(U16_LENGTH(CLOSE_DELIM) == 1); cursor++; // advance over CLOSE_DELIM str.truncate(0); str.append(c); text.handleReplaceBetween(openPos, cursor, str); // Adjust indices for the change in the length of // the string. Do not assume that str.length() == // 1, in case of surrogates. int32_t delta = cursor - openPos - str.length(); cursor -= delta; limit -= delta; // assert(cursor == openPos + str.length()); } } // If the lookup failed, we leave things as-is and // still switch to mode 0 and continue. mode = 0; openPos = -1; // close off candidate continue; // *** reprocess char32At(cursor) } // Check if c is a legal char. We assume here that // legal.contains(OPEN_DELIM) is FALSE, so when we abort a // name, we don't have to go back to openPos+1. if (legal.contains(c)) { name.append(c); // If we go past the longest possible name then abort. // maxLen includes temporary trailing space, so use '>='. if (name.length() >= maxLen) { mode = 0; } } // Invalid character else { --cursor; // Backup and reprocess this character mode = 0; } break; } cursor += U16_LENGTH(c); } offsets.contextLimit += limit - offsets.limit; offsets.limit = limit; // In incremental mode, only advance the cursor up to the last // open delimiter candidate. offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor; uprv_free(cbuf); }
/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Normalizes the text between offsets.start and offsets.limit with fNorm2,
 * one boundary-delimited chunk at a time, and updates the offsets for any
 * change in length.
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // Working copies of the input range.
    int32_t pos = offsets.start;
    int32_t end = offsets.limit;
    if (pos >= end) {
        return;
    }

    /*
     * Even in bulk mode the text is normalized in chunks that are as short
     * as possible, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If the input were known to be unstyled, a bulk mode normalization
     * could instead look like this:

    UnicodeString input, normalized;
    int32_t length = limit - start;
    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    fNorm2.normalize(input, normalized, status);

    text.handleReplaceBetween(start, limit, normalized);

    int32_t delta = normalized.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     */
    UErrorCode ec = U_ZERO_ERROR;
    UnicodeString chunk;
    UnicodeString chunkNormalized;

    // cp always holds the code point at pos when a chunk begins.
    UChar32 cp = text.char32At(pos);
    for (;;) {
        const int32_t chunkStart = pos;

        // Collect one chunk: always at least one code point (to guarantee
        // progress), then everything up to the next normalization boundary.
        chunk.remove();
        for (;;) {
            chunk.append(cp);
            pos += U16_LENGTH(cp);
            if (pos >= end) {
                break;
            }
            cp = text.char32At(pos);
            if (fNorm2.hasBoundaryBefore(cp)) {
                break;
            }
        }

        if (pos == end && isIncremental && !fNorm2.hasBoundaryAfter(cp)) {
            // In incremental mode a chunk touching the input limit is left
            // alone: characters arriving later could still change its
            // normalization result.
            pos = chunkStart;
            break;
        }

        fNorm2.normalize(chunk, chunkNormalized, ec);
        if (U_FAILURE(ec)) {
            break;
        }

        if (chunk != chunkNormalized) {
            // Swap the chunk for its normalized form ...
            text.handleReplaceBetween(chunkStart, pos, chunkNormalized);

            // ... and shift the indexes by the change in length.
            const int32_t delta = chunkNormalized.length() - (pos - chunkStart);
            pos += delta;
            end += delta;
        }

        if (pos >= end) {
            break;
        }
    }

    offsets.start = pos;
    offsets.contextLimit += end - offsets.limit;
    offsets.limit = end;
}
/*
 * Run every check enabled in the checker's fChecks mask against the
 * identifier and return the OR of the flags for the checks that failed
 * (plus the restriction level when USPOOF_AUX_INFO is enabled).
 *
 * Returns 0 if the checker does not validate. *position, when non-NULL, is
 * always set to 0 on the way out.
 */
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
                          const icu::UnicodeString &id,
                          int32_t *position,
                          UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }

    int32_t result = 0;
    IdentifierInfo *identifierInfo = NULL;

    // The restriction-level and mixed-number checks both read the identifier info.
    if (This->fChecks & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
        identifierInfo = This->getIdentifierInfo(*status);
        if (U_FAILURE(*status)) {
            goto cleanupAndReturn;
        }
        identifierInfo->setIdentifier(id, *status);
        identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
    }

    if (This->fChecks & USPOOF_RESTRICTION_LEVEL) {
        const URestrictionLevel level = identifierInfo->getRestrictionLevel(*status);
        if (level > This->fRestrictionLevel) {
            result |= USPOOF_RESTRICTION_LEVEL;
        }
        if (This->fChecks & USPOOF_AUX_INFO) {
            result |= level;
        }
    }

    if (This->fChecks & USPOOF_MIXED_NUMBERS) {
        if (identifierInfo->getNumerics()->size() > 1) {
            result |= USPOOF_MIXED_NUMBERS;
        }
        // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
        //       We have no easy way to do the same in C.
        // if (checkResult != null) {
        //     checkResult.numerics = numerics;
        // }
    }

    if (This->fChecks & USPOOF_CHAR_LIMIT) {
        // Flag the identifier as soon as one code point is outside the allowed set.
        const int32_t idLength = id.length();
        int32_t idx = 0;
        while (idx < idLength) {
            const UChar32 cp = id.char32At(idx);
            if (!This->fAllowedCharsSet->contains(cp)) {
                result |= USPOOF_CHAR_LIMIT;
                break;
            }
            idx += U16_LENGTH(cp);
        }
    }

    if (This->fChecks &
            (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // The remaining checks operate on the NFD form of the input.
        UnicodeString nfdText;
        gNfdNormalizer->normalize(id, nfdText, *status);

        if (This->fChecks & USPOOF_INVISIBLE) {
            // Look for the same non-spacing mark occurring more than once
            // within one run of consecutive non-spacing marks.
            const int32_t nfdLength = nfdText.length();
            UChar32 runFirstMark = 0;           // first mark of the current run, 0 if none yet
            UBool runHasSeveralMarks = FALSE;   // TRUE once runMarks is in use for this run
            UnicodeSet runMarks;                // marks seen in the current run
            UChar32 cp = 0;
            for (int32_t idx = 0; idx < nfdLength; idx += U16_LENGTH(cp)) {
                cp = nfdText.char32At(idx);
                if (u_charType(cp) != U_NON_SPACING_MARK) {
                    // Any other character ends the run.
                    runFirstMark = 0;
                    if (runHasSeveralMarks) {
                        runMarks.clear();
                        runHasSeveralMarks = FALSE;
                    }
                } else if (runFirstMark == 0) {
                    // A lone mark needs no set yet; just remember it.
                    runFirstMark = cp;
                } else {
                    if (!runHasSeveralMarks) {
                        runMarks.add(runFirstMark);
                        runHasSeveralMarks = TRUE;
                    }
                    if (runMarks.contains(cp)) {
                        // The first failure is enough; stop scanning.
                        result |= USPOOF_INVISIBLE;
                        break;
                    }
                    runMarks.add(cp);
                }
            }
        }

        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // Whole-script and mixed-script confusables share one basic test:
            // compute the set of scripts in which every input character has a
            // confusable. For this purpose a character always counts as
            // confusable with itself in its own script.
            //
            // Two or more such scripts, with input drawn from a single script,
            // means a whole script confusable (the original script plus the
            // one it is confusable with).
            //
            // One or more such scripts, with input drawn from more than one
            // script, means a mixed script confusable (some characters can be
            // transformed to give a visually similar single-script string).
            if (identifierInfo == NULL) {
                identifierInfo = This->getIdentifierInfo(*status);
                if (U_FAILURE(*status)) {
                    goto cleanupAndReturn;
                }
                identifierInfo->setIdentifier(id, *status);
            }

            const int32_t inputScriptCount = identifierInfo->getScriptCount();

            ScriptSet confusableScripts;
            This->wholeScriptCheck(nfdText, &confusableScripts, *status);
            const int32_t confusableScriptCount = confusableScripts.countMembers();

            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                    confusableScriptCount >= 2 && inputScriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }
            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                    confusableScriptCount >= 1 && inputScriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

cleanupAndReturn:
    This->releaseIdentifierInfo(identifierInfo);
    if (position != NULL) {
        *position = 0;
    }
    return result;
}