/**
 * Normalizes `source` into `result` using the Normalizer2 instance selected by `mode`.
 *
 * @param source  String to normalize; a bogus string is rejected.
 * @param mode    Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options If UNORM_UNICODE_3_2 is set, the normalizer is wrapped in a
 *                FilteredNormalizer2 over the set from uniset_getUnicode32Instance().
 * @param result  Receives the output; may be the same object as `source`.
 * @param status  In/out error code. A bogus source with a successful incoming
 *                status is reported as U_ILLEGAL_ARGUMENT_ERROR.
 */
void U_EXPORT2
Normalizer::normalize(const UnicodeString& source,
                      UNormalizationMode mode, int32_t options,
                      UnicodeString& result,
                      UErrorCode &status) {
    // Reject unusable input up front; result is marked bogus in that case.
    if(source.isBogus() || U_FAILURE(status)) {
        result.setToBogus();
        if(U_SUCCESS(status)) {
            status=U_ILLEGAL_ARGUMENT_ERROR;
        }
        return;
    }

    // When source and result are the same object, write into a scratch string
    // and copy it over only after a successful normalization.
    UnicodeString scratch;
    const UBool sameObject=(&source==&result);
    UnicodeString &target= sameObject ? scratch : result;

    const Normalizer2 *norm=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return;
    }

    if(options&UNORM_UNICODE_3_2) {
        // NOTE(review): the pointer returned by uniset_getUnicode32Instance() is
        // dereferenced without checking status first.
        FilteredNormalizer2 filtered(*norm, *uniset_getUnicode32Instance(status));
        filtered.normalize(source, target, status);
    } else {
        norm->normalize(source, target, status);
    }

    if(sameObject && U_SUCCESS(status)) {
        result=scratch;
    }
}
/**
 * C API: appends `right` to `left` with normalization at the seam and writes
 * the result to `dest`.
 *
 * `left` may be the same buffer as `dest`; `right` must not overlap `dest`.
 *
 * @param left         First string; must not be NULL.
 * @param leftLength   Length of left, or -1 (values below -1 are rejected).
 * @param right        Second string; must not be NULL.
 * @param rightLength  Length of right, or -1 (values below -1 are rejected).
 * @param dest         Output buffer; may be NULL only if destCapacity is 0.
 * @param destCapacity Capacity of dest in UChars; must not be negative.
 * @param mode         Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options      If UNORM_UNICODE_3_2 is set, a FilteredNormalizer2 over the
 *                     set from uniset_getUnicode32Instance() is used.
 * @param pErrorCode   In/out error code; U_ILLEGAL_ARGUMENT_ERROR on bad arguments.
 * @return The value returned by UnicodeString::extract() for the result,
 *         or 0 on failure.
 */
U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar * left, int32_t leftLength,
                  const UChar * right, int32_t rightLength,
                  UChar * dest, int32_t destCapacity,
                  UNormalizationMode mode, int32_t options,
                  UErrorCode * pErrorCode) {
    const Normalizer2 * n2 = Normalizer2Factory::getInstance(mode, *pErrorCode);
    const UnicodeSet * uni32 = NULL;
    if (options & UNORM_UNICODE_3_2) {
        uni32 = uniset_getUnicode32Instance(*pErrorCode);
    }
    // Check for failure before any of the pointers above is dereferenced.
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if (destCapacity < 0 || (dest == NULL && destCapacity > 0) ||
        left == NULL || leftLength < -1 ||
        right == NULL || rightLength < -1
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* check for overlapping right and destination */
    if (dest != NULL &&
        ((right >= dest && right < (dest + destCapacity)) ||
         (rightLength > 0 && dest >= right && dest < (right + rightLength)))
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* allow left==dest */
    UnicodeString destString;
    if (left == dest) {
        destString.setTo(dest, leftLength, destCapacity);
    } else {
        destString.setTo(dest, 0, destCapacity);
        destString.append(left, leftLength);
    }
    UnicodeString rightString(rightLength < 0, right, rightLength);

    if (options & UNORM_UNICODE_3_2) {
        // The filter is built only here, where a Unicode 3.2 set was actually
        // requested and obtained; the unfiltered path never touches uni32.
        FilteredNormalizer2 fn2(*n2, *uni32);
        return fn2.append(destString, rightString, *pErrorCode).
               extract(dest, destCapacity, *pErrorCode);
    }
    return n2->append(destString, rightString, *pErrorCode).
           extract(dest, destCapacity, *pErrorCode);
}
/**
 * C API: tests whether `src` is normalized under `mode`, delegating to
 * unorm2_isNormalized().
 *
 * @param src        Input text.
 * @param srcLength  Length of src, forwarded unchanged to unorm2_isNormalized().
 * @param mode       Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options    If UNORM_UNICODE_3_2 is set, the check runs through a
 *                   FilteredNormalizer2 over the set from uniset_getUnicode32Instance().
 * @param pErrorCode In/out error code.
 * @return The result of unorm2_isNormalized().
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
                              UNormalizationMode mode, int32_t options,
                              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_SUCCESS(*pErrorCode)) {
            // Both pointers are only dereferenced once the lookups succeeded.
            FilteredNormalizer2 fn2(*n2, *uni32);
            // Convert to the Normalizer2 base first, then to the opaque C handle.
            return unorm2_isNormalized(
                reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
                src, srcLength, pErrorCode);
        }
        // On failure fall through: the call below receives the failing error
        // code, exactly as the unfiltered path does when getInstance() fails.
    }
    return unorm2_isNormalized(reinterpret_cast<const UNormalizer2 *>(n2),
                               src, srcLength, pErrorCode);
}
void Normalizer::init() { UErrorCode errorCode=U_ZERO_ERROR; fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); if(fOptions&UNORM_UNICODE_3_2) { delete fFilteredNorm2; fNorm2=fFilteredNorm2= new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); } if(U_FAILURE(errorCode)) { errorCode=U_ZERO_ERROR; fNorm2=Normalizer2Factory::getNoopInstance(errorCode); } }
/**
 * C API: quick-check of `src` under `mode`, delegating to unorm2_quickCheck().
 *
 * @param src        Input text.
 * @param srcLength  Length of src, forwarded unchanged to unorm2_quickCheck().
 * @param mode       Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options    If UNORM_UNICODE_3_2 is set, the check runs through a
 *                   FilteredNormalizer2 over the set from uniset_getUnicode32Instance().
 * @param pErrorCode In/out error code.
 * @return The result of unorm2_quickCheck().
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
                            UNormalizationMode mode, int32_t options,
                            UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_SUCCESS(*pErrorCode)) {
            // Both pointers are only dereferenced once the lookups succeeded.
            FilteredNormalizer2 fn2(*n2, *uni32);
            return unorm2_quickCheck(
                reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
                src, srcLength, pErrorCode);
        }
        // On failure fall through: the call below receives the failing error
        // code, exactly as the unfiltered path does when getInstance() fails.
    }
    return unorm2_quickCheck(reinterpret_cast<const UNormalizer2 *>(n2),
                             src, srcLength, pErrorCode);
}
/**
 * Appends `right` to a copy of `left` through the Normalizer2 selected by
 * `mode` and stores the outcome in `result`.
 *
 * @param left      First string; a bogus string is rejected.
 * @param right     Second string; a bogus string is rejected. May be the same
 *                  object as `result`.
 * @param result    Receives the concatenation.
 * @param mode      Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options   If UNORM_UNICODE_3_2 is set, a FilteredNormalizer2 over the
 *                  set from uniset_getUnicode32Instance() performs the append.
 * @param errorCode In/out error code. Bogus input with a successful incoming
 *                  code is reported as U_ILLEGAL_ARGUMENT_ERROR.
 * @return `result`.
 */
UnicodeString & U_EXPORT2
Normalizer::concatenate(UnicodeString & left, UnicodeString & right,
                        UnicodeString & result,
                        UNormalizationMode mode, int32_t options,
                        UErrorCode & errorCode) {
    // Reject unusable input up front; result is marked bogus in that case.
    if (left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
        result.setToBogus();
        if (U_SUCCESS(errorCode)) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return result;
    }

    // If right and result are the same object, build the output in a scratch
    // string so that right is still intact when it is appended.
    UnicodeString scratch;
    const UBool rightIsResult = (&right == &result);
    UnicodeString & target = rightIsResult ? scratch : result;
    target = left;

    const Normalizer2 * norm = Normalizer2Factory::getInstance(mode, errorCode);
    if (U_SUCCESS(errorCode)) {
        if (options & UNORM_UNICODE_3_2) {
            // NOTE(review): the pointer returned by uniset_getUnicode32Instance()
            // is dereferenced without checking errorCode first.
            FilteredNormalizer2 filtered(*norm, *uniset_getUnicode32Instance(errorCode));
            filtered.append(target, right, errorCode);
        } else {
            norm->append(target, right, errorCode);
        }
    }

    if (rightIsResult && U_SUCCESS(errorCode)) {
        result = scratch;
    }
    return result;
}
/**
 * Public API for normalizing.
 *
 * Delegates to unorm2_normalize() with the Normalizer2 selected by `mode`.
 *
 * @param src          Input text.
 * @param srcLength    Length of src, forwarded unchanged to unorm2_normalize().
 * @param mode         Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options      If UNORM_UNICODE_3_2 is set, normalization runs through a
 *                     FilteredNormalizer2 over the set from uniset_getUnicode32Instance().
 * @param dest         Output buffer, forwarded unchanged.
 * @param destCapacity Capacity of dest, forwarded unchanged.
 * @param pErrorCode   In/out error code.
 * @return The result of unorm2_normalize().
 */
U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar *src, int32_t srcLength,
                UNormalizationMode mode, int32_t options,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_SUCCESS(*pErrorCode)) {
            // Both pointers are only dereferenced once the lookups succeeded.
            FilteredNormalizer2 fn2(*n2, *uni32);
            // Convert to the Normalizer2 base first, then to the opaque C handle.
            return unorm2_normalize(
                reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
                src, srcLength, dest, destCapacity, pErrorCode);
        }
        // On failure fall through: the call below receives the failing error
        // code, exactly as the unfiltered path does when getInstance() fails.
    }
    return unorm2_normalize(reinterpret_cast<const UNormalizer2 *>(n2),
                            src, srcLength, dest, destCapacity, pErrorCode);
}
/**
 * Tests whether `source` is normalized under `mode`.
 *
 * @param source  String to test.
 * @param mode    Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options If UNORM_UNICODE_3_2 is set, the test runs through a
 *                FilteredNormalizer2 over the set from uniset_getUnicode32Instance().
 * @param status  In/out error code.
 * @return The normalizer's isNormalized() result, or FALSE if the normalizer
 *         or the Unicode 3.2 set could not be obtained.
 */
UBool
Normalizer::isNormalized(const UnicodeString& source,
                         UNormalizationMode mode, int32_t options,
                         UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return FALSE;
    }
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(status);
        // Do not dereference the set pointer if the lookup failed.
        if(U_FAILURE(status)) {
            return FALSE;
        }
        return FilteredNormalizer2(*n2, *uni32).isNormalized(source, status);
    }
    return n2->isNormalized(source, status);
}
/**
 * Quick-checks `source` under `mode`.
 *
 * @param source  String to test.
 * @param mode    Normalization mode passed to Normalizer2Factory::getInstance().
 * @param options If UNORM_UNICODE_3_2 is set, the check runs through a
 *                FilteredNormalizer2 over the set from uniset_getUnicode32Instance().
 * @param status  In/out error code.
 * @return The normalizer's quickCheck() result, or UNORM_MAYBE if the
 *         normalizer or the Unicode 3.2 set could not be obtained.
 */
UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       UNormalizationMode mode, int32_t options,
                       UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return UNORM_MAYBE;
    }
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(status);
        // Do not dereference the set pointer if the lookup failed.
        if(U_FAILURE(status)) {
            return UNORM_MAYBE;
        }
        return FilteredNormalizer2(*n2, *uni32).quickCheck(source, status);
    }
    return n2->quickCheck(source, status);
}
/**
 * C API entry point for concatenation with normalization.
 *
 * Picks the Normalizer2 for `mode`, optionally wraps it in a
 * FilteredNormalizer2 (UNORM_UNICODE_3_2), and forwards all other arguments
 * to _concatenate().
 *
 * @return The result of _concatenate(), or 0 if the Unicode 3.2 branch is
 *         taken and *pErrorCode indicates failure after the set lookup.
 */
U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar *left, int32_t leftLength,
                  const UChar *right, int32_t rightLength,
                  UChar *dest, int32_t destCapacity,
                  UNormalizationMode mode, int32_t options,
                  UErrorCode *pErrorCode) {
    const Normalizer2 *norm=Normalizer2Factory::getInstance(mode, *pErrorCode);

    // Common case: no Unicode 3.2 filtering requested.
    if((options&UNORM_UNICODE_3_2)==0) {
        return _concatenate(left, leftLength, right, rightLength,
                            dest, destCapacity, norm, pErrorCode);
    }

    const UnicodeSet *unicode32Set=uniset_getUnicode32Instance(*pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // The filter lives on the stack for the duration of the call only.
    FilteredNormalizer2 filtered(*norm, *unicode32Set);
    return _concatenate(left, leftLength, right, rightLength,
                        dest, destCapacity, &filtered, pErrorCode);
}
/**
 * Picks the Normalizer2 for `mode`, optionally wraps it in a
 * FilteredNormalizer2 (UNORM_UNICODE_3_2), and forwards all other arguments
 * to _iterate().
 *
 * @return The result of _iterate(), or 0 if the Unicode 3.2 branch is taken
 *         and *pErrorCode indicates failure after the set lookup.
 */
static int32_t
unorm_iterate(UCharIterator *src, UBool forward,
              UChar *dest, int32_t destCapacity,
              UNormalizationMode mode, int32_t options,
              UBool doNormalize, UBool *pNeededToNormalize,
              UErrorCode *pErrorCode) {
    const Normalizer2 *norm=Normalizer2Factory::getInstance(mode, *pErrorCode);

    // Common case: no Unicode 3.2 filtering requested.
    if((options&UNORM_UNICODE_3_2)==0) {
        return _iterate(src, forward, dest, destCapacity,
                        norm, doNormalize, pNeededToNormalize, pErrorCode);
    }

    const UnicodeSet *unicode32Set=uniset_getUnicode32Instance(*pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // The filter lives on the stack for the duration of the call only.
    FilteredNormalizer2 filtered(*norm, *unicode32Set);
    return _iterate(src, forward, dest, destCapacity,
                    &filtered, doNormalize, pNeededToNormalize, pErrorCode);
}
U_CAPI int32_t U_EXPORT2 unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { /* argument checking */ if(U_FAILURE(*pErrorCode)) { return 0; } if(s1==0 || length1<-1 || s2==0 || length2<-1) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } UnicodeString fcd1, fcd2; int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); options|=_COMPARE_EQUIV; /* * UAX #21 Case Mappings, as fixed for Unicode version 4 * (see Jitterbug 2021), defines a canonical caseless match as * * A string X is a canonical caseless match * for a string Y if and only if * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) * * For better performance, we check for FCD (or let the caller tell us that * both strings are in FCD) for the inner normalization. * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that * case-folding preserves the FCD-ness of a string. * The outer normalization is then only performed by unorm_cmpEquivFold() * when there is a difference. * * Exception: When using the Turkic case-folding option, we do perform * full NFD first. This is because in the Turkic case precomposed characters * with 0049 capital I or 0069 small i fold differently whether they * are first decomposed or not, so an FCD check - a check only for * canonical order - is not sufficient. 
*/ if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) { const Normalizer2 *n2; if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { n2=Normalizer2Factory::getNFDInstance(*pErrorCode); } else { n2=Normalizer2Factory::getFCDInstance(*pErrorCode); } if (U_FAILURE(*pErrorCode)) { return 0; } // check if s1 and/or s2 fulfill the FCD conditions const UnicodeSet *uni32; if(normOptions&UNORM_UNICODE_3_2) { uni32=uniset_getUnicode32Instance(*pErrorCode); } else { uni32=NULL; // unused } FilteredNormalizer2 fn2(*n2, *uni32); if(normOptions&UNORM_UNICODE_3_2) { n2=&fn2; } UnicodeString str1(length1<0, s1, length1); UnicodeString str2(length2<0, s2, length2); int32_t spanQCYes1=n2->spanQuickCheckYes(str1, *pErrorCode); int32_t spanQCYes2=n2->spanQuickCheckYes(str2, *pErrorCode); if(U_FAILURE(*pErrorCode)) { return 0; } /* * ICU 2.4 had a further optimization: * If both strings were not in FCD, then they were both NFD'ed, * and the _COMPARE_EQUIV option was turned off. * It is not entirely clear that this is valid with the current * definition of the canonical caseless match. * Therefore, ICU 2.6 removes that optimization. */ if(spanQCYes1<str1.length()) { UnicodeString unnormalized=str1.tempSubString(spanQCYes1); fcd1.setTo(FALSE, str1.getBuffer(), spanQCYes1); n2->normalizeSecondAndAppend(fcd1, unnormalized, *pErrorCode); s1=fcd1.getBuffer(); length1=fcd1.length(); } if(spanQCYes2<str2.length()) { UnicodeString unnormalized=str2.tempSubString(spanQCYes2); fcd2.setTo(FALSE, str2.getBuffer(), spanQCYes2); n2->normalizeSecondAndAppend(fcd2, unnormalized, *pErrorCode); s2=fcd2.getBuffer(); length2=fcd2.length(); } } if(U_SUCCESS(*pErrorCode)) { return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); } else { return 0; } }
/**
 * Helper for unorm_iterate(): collects one run of text from `src` up to the
 * next boundary reported by n2->hasBoundaryBefore() and writes it to `dest`,
 * normalized if `doNormalize` is set.
 *
 * @param src                Character iterator; must not be NULL.
 * @param forward            TRUE to read with uiter_next32(), FALSE to read
 *                           with uiter_previous32().
 * @param dest               Output buffer; may be NULL only if destCapacity is 0.
 * @param destCapacity       Capacity of dest; must not be negative.
 * @param n2                 Normalizer used for boundary tests and normalization.
 * @param doNormalize        If FALSE the collected text is copied unchanged.
 * @param pNeededToNormalize Optional output; set to FALSE initially and, after
 *                           a successful normalization, to whether the
 *                           normalized text differs from the collected text.
 * @param pErrorCode         In/out error code; U_ILLEGAL_ARGUMENT_ERROR on bad arguments.
 * @return Length of the output text, or 0 on argument errors.
 */
static int32_t
unorm_iterateWithNormalizer(UCharIterator *src, UBool forward,
                            UChar *dest, int32_t destCapacity,
                            const Normalizer2 *n2,
                            UBool doNormalize, UBool *pNeededToNormalize,
                            UErrorCode *pErrorCode) {
    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        src==NULL
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pNeededToNormalize!=NULL) {
        *pNeededToNormalize=FALSE;
    }
    // Nothing to read in the requested direction: produce an empty result.
    if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    UnicodeString buffer;
    UChar32 c;
    if(forward) {
        /* get one character and ignore its properties */
        buffer.append(uiter_next32(src));
        /* get all following characters until we see a boundary */
        while((c=uiter_next32(src))>=0) {
            if(n2->hasBoundaryBefore(c)) {
                /* back out the latest movement to stop at the boundary */
                src->move(src, -U16_LENGTH(c), UITER_CURRENT);
                break;
            } else {
                buffer.append(c);
            }
        }
    } else {
        while((c=uiter_previous32(src))>=0) {
            /* always write this character to the front of the buffer */
            buffer.insert(0, c);
            /* stop if this just-copied character is a boundary */
            if(n2->hasBoundaryBefore(c)) {
                break;
            }
        }
    }

    UnicodeString destString(dest, 0, destCapacity);
    if(buffer.length()>0 && doNormalize) {
        n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
        if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
            *pNeededToNormalize= destString!=buffer;
        }
        return destString.length();
    } else {
        /* just copy the source characters */
        return buffer.extract(dest, destCapacity, *pErrorCode);
    }
}

/**
 * Picks the Normalizer2 for `mode`, optionally wraps it in a
 * FilteredNormalizer2 (UNORM_UNICODE_3_2), and iterates over one run of text
 * from `src` with it.
 *
 * @return The result of the iteration helper, or 0 if *pErrorCode indicates
 *         failure after the normalizer (and set) lookup.
 */
static int32_t
unorm_iterate(UCharIterator *src, UBool forward,
              UChar *dest, int32_t destCapacity,
              UNormalizationMode mode, int32_t options,
              UBool doNormalize, UBool *pNeededToNormalize,
              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }
        // The filter is built only when requested, after both lookups
        // succeeded, so no null pointer is ever dereferenced.
        FilteredNormalizer2 fn2(*n2, *uni32);
        return unorm_iterateWithNormalizer(src, forward, dest, destCapacity,
                                           &fn2, doNormalize, pNeededToNormalize,
                                           pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    return unorm_iterateWithNormalizer(src, forward, dest, destCapacity,
                                       n2, doNormalize, pNeededToNormalize,
                                       pErrorCode);
}