/** * Constructs a transliterator. */ CaseMapTransliterator::CaseMapTransliterator(const UnicodeString &id, UCaseMapFull *map) : Transliterator(id, 0), fCsp(ucase_getSingleton()), fMap(map) { // TODO test incremental mode with context-sensitive text (e.g. greek sigma) // TODO need to call setMaximumContextLength()?! }
/* * Set parameters on an empty UCaseMap, for UCaseMap-less API functions. * Do this fast because it is called with every function call. */ static U_INLINE void setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { if(csm->csp==NULL) { csm->csp=ucase_getSingleton(); } if(locale!=NULL && locale[0]==0) { csm->locale[0]=0; } else { setTempCaseMapLocale(csm, locale, pErrorCode); } }
/** * Constructs a transliterator. */ CaseMapTransliterator::CaseMapTransliterator(const UnicodeString &id, UCaseMapFull *map) : Transliterator(id, 0), fCsp(NULL), fMap(map) { UErrorCode errorCode = U_ZERO_ERROR; fCsp = ucase_getSingleton(&errorCode); // expect to get NULL if failure // TODO test incremental mode with context-sensitive text (e.g. greek sigma) // TODO need to call setMaximumContextLength()?! }
/* * Set parameters on an empty UCaseMap, for UCaseMap-less API functions. * Do this fast because it is called with every function call. * Duplicate of the same function in ustrcase.cpp, to keep it inline. */ static inline void setTempCaseMap(UCaseMap *csm, const char *locale) { if(csm->csp==NULL) { csm->csp=ucase_getSingleton(); } if(locale!=NULL && locale[0]==0) { csm->locale[0]=0; } else { ustrcase_setTempCaseMapLocale(csm, locale); } }
U_CAPI int32_t U_EXPORT2 u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode) { UCaseMap csm={ NULL }; csm.csp=ucase_getSingleton(); csm.options=options; return caseMap(&csm, dest, destCapacity, src, srcLength, FOLD_CASE, pErrorCode); }
U_CAPI int32_t U_EXPORT2 u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode) { UCaseMap csm=UCASEMAP_INITIALIZER; csm.csp=ucase_getSingleton(); csm.options=options; return ustrcase_map( &csm, dest, destCapacity, src, srcLength, ustrcase_internalFold, pErrorCode); }
static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { UnicodeString nfd; UErrorCode errorCode=U_ZERO_ERROR; const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode); if(U_FAILURE(errorCode)) { return FALSE; } if(nfcNorm2->getDecomposition(c, nfd)) { /* c has a decomposition */ if(nfd.length()==1) { c=nfd[0]; /* single BMP code point */ } else if(nfd.length()<=U16_MAX_LENGTH && nfd.length()==U16_LENGTH(c=nfd.char32At(0)) ) { /* single supplementary code point */ } else { c=U_SENTINEL; } } else if(c<0) { return FALSE; /* protect against bad input */ } if(c>=0) { /* single code point */ const UCaseProps *csp=ucase_getSingleton(); const UChar *resultString; return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0); } else { /* guess some large but stack-friendly capacity */ UChar dest[2*UCASE_MAX_STRING_LENGTH]; int32_t destLength; destLength=u_strFoldCase(dest, LENGTHOF(dest), nfd.getBuffer(), nfd.length(), U_FOLD_CASE_DEFAULT, &errorCode); return (UBool)(U_SUCCESS(errorCode) && 0!=u_strCompare(nfd.getBuffer(), nfd.length(), dest, destLength, FALSE)); } }
static const UCaseProps * getCaseProps() { /* * This lazy intialization with double-checked locking (without mutex protection for * the initial check) is transiently unsafe under certain circumstances. * Check the readme and use u_init() if necessary. */ /* the initial check is performed by the GET_CASE_PROPS() macro */ const UCaseProps *csp; UErrorCode errorCode=U_ZERO_ERROR; csp=ucase_getSingleton(&errorCode); if(U_FAILURE(errorCode)) { errorCode=U_ZERO_ERROR; csp=ucase_getDummy(&errorCode); if(U_FAILURE(errorCode)) { return NULL; } } return csp; }
U_CAPI UCaseMap * U_EXPORT2 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { UCaseMap *csm; if(U_FAILURE(*pErrorCode)) { return NULL; } csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap)); if(csm==NULL) { return NULL; } uprv_memset(csm, 0, sizeof(UCaseMap)); csm->csp=ucase_getSingleton(pErrorCode); ucasemap_setLocale(csm, locale, pErrorCode); if(U_FAILURE(*pErrorCode)) { uprv_free(csm); return NULL; } csm->options=options; return csm; }
static int32_t caseMap(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBreakIterator *titleIter, const char *locale, uint32_t options, int32_t toWhichCase, UErrorCode *pErrorCode) { UChar buffer[300]; UChar *temp; const UCaseProps *csp; int32_t destLength; UBool ownTitleIter; /* check argument values */ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return 0; } if( destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL || srcLength<-1 ) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } csp=ucase_getSingleton(pErrorCode); if(U_FAILURE(*pErrorCode)) { return 0; } /* get the string length */ if(srcLength==-1) { srcLength=u_strlen(src); } /* check for overlapping source and destination */ if( dest!=NULL && ((src>=dest && src<(dest+destCapacity)) || (dest>=src && dest<(src+srcLength))) ) { /* overlap: provide a temporary destination buffer and later copy the result */ if(destCapacity<=(sizeof(buffer)/U_SIZEOF_UCHAR)) { /* the stack buffer is large enough */ temp=buffer; } else { /* allocate a buffer */ temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); if(temp==NULL) { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; return 0; } } } else { temp=dest; } ownTitleIter=FALSE; destLength=0; if(toWhichCase==FOLD_CASE) { destLength=ustr_foldCase(csp, temp, destCapacity, src, srcLength, options, pErrorCode); } else { UCaseContext csc={ NULL }; int32_t locCache; csc.p=(void *)src; csc.limit=srcLength; locCache=0; /* the internal functions require locale!=NULL */ if(locale==NULL) { locale=uloc_getDefault(); } if(toWhichCase==TO_LOWER) { destLength=_caseMap(csp, ucase_toFullLower, temp, destCapacity, src, &csc, 0, srcLength, locale, &locCache, pErrorCode); } else if(toWhichCase==TO_UPPER) { destLength=_caseMap(csp, ucase_toFullUpper, temp, destCapacity, src, &csc, 0, srcLength, locale, &locCache, pErrorCode); } else /* if(toWhichCase==TO_TITLE) */ { #if UCONFIG_NO_BREAK_ITERATION *pErrorCode=U_UNSUPPORTED_ERROR; #else if(titleIter==NULL) { titleIter=ubrk_open(UBRK_WORD, locale, src, srcLength, pErrorCode); ownTitleIter=(UBool)U_SUCCESS(*pErrorCode); } if(U_SUCCESS(*pErrorCode)) { destLength=_toTitle(csp, temp, destCapacity, src, &csc, srcLength, titleIter, locale, &locCache, pErrorCode); } #endif } } if(temp!=dest) { /* copy the result string to the destination buffer */ if(destLength>0) { int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; if(copyLength>0) { uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR); } } if(temp!=buffer) { uprv_free(temp); } } #if !UCONFIG_NO_BREAK_ITERATION if(ownTitleIter) { ubrk_close(titleIter); } #endif return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); }
U_NAMESPACE_BEGIN CaseFoldingUTextIterator::CaseFoldingUTextIterator(UText &text) : fUText(text), fcsp(NULL), fFoldChars(NULL), fFoldLength(0) { fcsp = ucase_getSingleton(); }
U_CDECL_END U_NAMESPACE_BEGIN /* Reduce excessive reallocation, and make it easier to detect initialization problems. Usually you don't see smaller sets than this for Unicode 5.0. */ #define DEFAULT_INCLUSION_CAPACITY 3072 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { UBool needInit; UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); if (needInit) { UnicodeSet* incl = new UnicodeSet(); USetAdder sa = { (USet *)incl, _set_add, _set_addRange, _set_addString, NULL, // don't need remove() NULL // don't need removeRange() }; incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); if (incl != NULL) { switch(src) { case UPROPS_SRC_CHAR: uchar_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_PROPSVEC: upropsvec_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_CHAR_AND_PROPSVEC: uchar_addPropertyStarts(&sa, &status); upropsvec_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_HST: uhst_addPropertyStarts(&sa, &status); break; #if !UCONFIG_NO_NORMALIZATION case UPROPS_SRC_NORM: unorm_addPropertyStarts(&sa, &status); break; #endif case UPROPS_SRC_CASE: ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status); break; case UPROPS_SRC_BIDI: ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status); break; default: status = U_INTERNAL_PROGRAM_ERROR; break; } if (U_SUCCESS(status)) { // Compact for caching incl->compact(); umtx_lock(NULL); if (INCLUSIONS[src] == NULL) { INCLUSIONS[src] = incl; incl = NULL; ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); } umtx_unlock(NULL); } delete incl; } else { status = U_MEMORY_ALLOCATION_ERROR; } } return INCLUSIONS[src]; }
U_CAPI UBool U_EXPORT2 u_hasBinaryProperty(UChar32 c, UProperty which) { /* c is range-checked in the functions that are called from here */ if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { /* not a known binary property */ } else { uint32_t mask=binProps[which].mask; int32_t column=binProps[which].column; if(mask!=0) { /* systematic, directly stored properties */ return (u_getUnicodeProperties(c, column)&mask)!=0; } else { if(column==UPROPS_SRC_CASE) { return ucase_hasBinaryProperty(c, which); } else if(column==UPROPS_SRC_NORM) { #if !UCONFIG_NO_NORMALIZATION /* normalization properties from unorm.icu */ switch(which) { case UCHAR_SEGMENT_STARTER: return unorm_isCanonSafeStart(c); default: break; } #endif } else if(column==UPROPS_SRC_NFC) { #if !UCONFIG_NO_NORMALIZATION UErrorCode errorCode=U_ZERO_ERROR; switch(which) { case UCHAR_FULL_COMPOSITION_EXCLUSION: { // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); break; } default: { // UCHAR_NF[CD]_INERT properties const Normalizer2 *norm2=Normalizer2Factory::getInstance( (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode); return U_SUCCESS(errorCode) && norm2->isInert(c); } } #endif } else if(column==UPROPS_SRC_NFKC) { #if !UCONFIG_NO_NORMALIZATION // UCHAR_NFK[CD]_INERT properties UErrorCode errorCode=U_ZERO_ERROR; const Normalizer2 *norm2=Normalizer2Factory::getInstance( (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode); return U_SUCCESS(errorCode) && norm2->isInert(c); #endif } else if(column==UPROPS_SRC_NFKC_CF) { // currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED #if !UCONFIG_NO_NORMALIZATION UErrorCode errorCode=U_ZERO_ERROR; const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode); if(U_SUCCESS(errorCode)) { UnicodeString src(c); UnicodeString dest; { // The ReorderingBuffer must be in a block because its destructor // needs to release dest's buffer before we look at its contents. ReorderingBuffer buffer(*kcf, dest); // Small destCapacity for NFKC_CF(c). if(buffer.init(5, errorCode)) { const UChar *srcArray=src.getBuffer(); kcf->compose(srcArray, srcArray+src.length(), FALSE, TRUE, buffer, errorCode); } } return U_SUCCESS(errorCode) && dest!=src; } #endif } else if(column==UPROPS_SRC_BIDI) { /* bidi/shaping properties */ const UBiDiProps *bdp=GET_BIDI_PROPS(); if(bdp!=NULL) { switch(which) { case UCHAR_BIDI_MIRRORED: return ubidi_isMirrored(bdp, c); case UCHAR_BIDI_CONTROL: return ubidi_isBidiControl(bdp, c); case UCHAR_JOIN_CONTROL: return ubidi_isJoinControl(bdp, c); default: break; } } /* else return FALSE below */ } else if(column==UPROPS_SRC_CHAR) { switch(which) { case UCHAR_POSIX_BLANK: return u_isblank(c); case UCHAR_POSIX_GRAPH: return u_isgraphPOSIX(c); case UCHAR_POSIX_PRINT: return u_isprintPOSIX(c); case UCHAR_POSIX_XDIGIT: return u_isxdigit(c); default: break; } } else if(column==UPROPS_SRC_CHAR_AND_PROPSVEC) { switch(which) { case UCHAR_POSIX_ALNUM: return u_isalnumPOSIX(c); default: break; } } else if(column==UPROPS_SRC_CASE_AND_NORM) { #if !UCONFIG_NO_NORMALIZATION UChar nfdBuffer[4]; const UChar *nfd; int32_t nfdLength; UErrorCode errorCode=U_ZERO_ERROR; const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); if(U_FAILURE(errorCode)) { return FALSE; } switch(which) { case UCHAR_CHANGES_WHEN_CASEFOLDED: nfd=nfcImpl->getDecomposition(c, nfdBuffer, nfdLength); if(nfd!=NULL) { /* c has a decomposition */ if(nfdLength==1) { c=nfd[0]; /* single BMP code point */ } else if(nfdLength<=U16_MAX_LENGTH) { int32_t i=0; U16_NEXT(nfd, i, nfdLength, c); if(i==nfdLength) { /* single supplementary code point */ } else { c=U_SENTINEL; } } else { c=U_SENTINEL; } } else if(c<0) { return FALSE; /* protect against bad input */ } errorCode=U_ZERO_ERROR; if(c>=0) { /* single code point */ const UCaseProps *csp=ucase_getSingleton(&errorCode); const UChar *resultString; return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0); } else { /* guess some large but stack-friendly capacity */ UChar dest[2*UCASE_MAX_STRING_LENGTH]; int32_t destLength; destLength=u_strFoldCase(dest, LENGTHOF(dest), nfd, nfdLength, U_FOLD_CASE_DEFAULT, &errorCode); return (UBool)(U_SUCCESS(errorCode) && 0!=u_strCompare(nfd, nfdLength, dest, destLength, FALSE)); } default: break; } #endif } } } return FALSE; }
U_CDECL_END U_NAMESPACE_BEGIN #define DEFAULT_INCLUSION_CAPACITY 3072 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { UBool needInit; UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); if (needInit) { UnicodeSet* incl = new UnicodeSet(); USetAdder sa = { (USet *)incl, _set_add, _set_addRange, _set_addString, NULL, // don't need remove() NULL // don't need removeRange() }; incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); if (incl != NULL) { switch(src) { case UPROPS_SRC_CHAR: uchar_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_PROPSVEC: upropsvec_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_CHAR_AND_PROPSVEC: uchar_addPropertyStarts(&sa, &status); upropsvec_addPropertyStarts(&sa, &status); break; #if !UCONFIG_NO_NORMALIZATION case UPROPS_SRC_NORM: unorm_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_CASE_AND_NORM: ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status); unorm_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_NFC: { const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); if(U_SUCCESS(status)) { impl->addPropertyStarts(&sa, status); } break; } case UPROPS_SRC_NFKC: { const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); if(U_SUCCESS(status)) { impl->addPropertyStarts(&sa, status); } break; } case UPROPS_SRC_NFKC_CF: { const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); if(U_SUCCESS(status)) { impl->addPropertyStarts(&sa, status); } break; } #endif case UPROPS_SRC_CASE: ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status); break; case UPROPS_SRC_BIDI: ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status); break; default: status = U_INTERNAL_PROGRAM_ERROR; break; } if (U_SUCCESS(status)) { // Compact for caching incl->compact(); umtx_lock(NULL); if (INCLUSIONS[src] == NULL) { INCLUSIONS[src] = incl; incl = NULL; ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); } umtx_unlock(NULL); } delete incl; } else { status = U_MEMORY_ALLOCATION_ERROR; } } return INCLUSIONS[src]; }
U_CDECL_END U_NAMESPACE_BEGIN /* Reduce excessive reallocation, and make it easier to detect initialization problems. Usually you don't see smaller sets than this for Unicode 5.0. */ #define DEFAULT_INCLUSION_CAPACITY 3072 void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { // This function is invoked only via umtx_initOnce(). // This function is a friend of class UnicodeSet. U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); UnicodeSet * &incl = gInclusions[src].fSet; U_ASSERT(incl == NULL); incl = new UnicodeSet(); if (incl == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } USetAdder sa = { (USet *)incl, _set_add, _set_addRange, _set_addString, NULL, // don't need remove() NULL // don't need removeRange() }; incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); switch(src) { case UPROPS_SRC_CHAR: uchar_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_PROPSVEC: upropsvec_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_CHAR_AND_PROPSVEC: uchar_addPropertyStarts(&sa, &status); upropsvec_addPropertyStarts(&sa, &status); break; #if !UCONFIG_NO_NORMALIZATION case UPROPS_SRC_CASE_AND_NORM: { const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); if(U_SUCCESS(status)) { impl->addPropertyStarts(&sa, status); } ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); break; } case UPROPS_SRC_NFC: { const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); if(U_SUCCESS(status)) { impl->addPropertyStarts(&sa, status); } break; } case UPROPS_SRC_NFKC: { const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); if(U_SUCCESS(status)) { impl->addPropertyStarts(&sa, status); } break; } case UPROPS_SRC_NFKC_CF: { const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); if(U_SUCCESS(status)) { impl->addPropertyStarts(&sa, status); } break; } case UPROPS_SRC_NFC_CANON_ITER: { const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); if(U_SUCCESS(status)) { impl->addCanonIterPropertyStarts(&sa, status); } break; } #endif case UPROPS_SRC_CASE: ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); break; case UPROPS_SRC_BIDI: ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); break; default: status = U_INTERNAL_PROGRAM_ERROR; break; } if (U_FAILURE(status)) { delete incl; incl = NULL; return; } // Compact for caching incl->compact(); ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); }
/* internal function */ U_CFUNC int32_t u_strcmpFold(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { const UCaseProps *csp; /* current-level start/limit - s1/s2 as current */ const UChar *start1, *start2, *limit1, *limit2; /* case folding variables */ const UChar *p; int32_t length; /* stacks of previous-level start/current/limit */ CmpEquivLevel stack1[2], stack2[2]; /* case folding buffers, only use current-level start/limit */ UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; /* track which is the current level per string */ int32_t level1, level2; /* current code units, and code points for lookups */ UChar32 c1, c2, cp1, cp2; /* no argument error checking because this itself is not an API */ /* * assume that at least the option U_COMPARE_IGNORE_CASE is set * otherwise this function would have to behave exactly as uprv_strCompare() */ csp=ucase_getSingleton(); if(U_FAILURE(*pErrorCode)) { return 0; } /* initialize */ start1=s1; if(length1==-1) { limit1=NULL; } else { limit1=s1+length1; } start2=s2; if(length2==-1) { limit2=NULL; } else { limit2=s2+length2; } level1=level2=0; c1=c2=-1; /* comparison loop */ for(;;) { /* * here a code unit value of -1 means "get another code unit" * below it will mean "this source is finished" */ if(c1<0) { /* get next code unit from string 1, post-increment */ for(;;) { if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { if(level1==0) { c1=-1; break; } } else { ++s1; break; } /* reached end of level buffer, pop one level */ do { --level1; start1=stack1[level1].start; } while(start1==NULL); s1=stack1[level1].s; limit1=stack1[level1].limit; } } if(c2<0) { /* get next code unit from string 2, post-increment */ for(;;) { if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { if(level2==0) { c2=-1; break; } } else { ++s2; break; } /* reached end of level buffer, pop one level */ do { --level2; start2=stack2[level2].start; } while(start2==NULL); s2=stack2[level2].s; limit2=stack2[level2].limit; } } /* * compare c1 and c2 * either variable c1, c2 is -1 only if the corresponding string is finished */ if(c1==c2) { if(c1<0) { return 0; /* c1==c2==-1 indicating end of strings */ } c1=c2=-1; /* make us fetch new code units */ continue; } else if(c1<0) { return -1; /* string 1 ends before string 2 */ } else if(c2<0) { return 1; /* string 2 ends before string 1 */ } /* c1!=c2 && c1>=0 && c2>=0 */ /* get complete code points for c1, c2 for lookups if either is a surrogate */ cp1=c1; if(U_IS_SURROGATE(c1)) { UChar c; if(U_IS_SURROGATE_LEAD(c1)) { if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { /* advance ++s1; only below if cp1 decomposes/case-folds */ cp1=U16_GET_SUPPLEMENTARY(c1, c); } } else /* isTrail(c1) */ { if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { cp1=U16_GET_SUPPLEMENTARY(c, c1); } } } cp2=c2; if(U_IS_SURROGATE(c2)) { UChar c; if(U_IS_SURROGATE_LEAD(c2)) { if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { /* advance ++s2; only below if cp2 decomposes/case-folds */ cp2=U16_GET_SUPPLEMENTARY(c2, c); } } else /* isTrail(c2) */ { if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { cp2=U16_GET_SUPPLEMENTARY(c, c2); } } } /* * go down one level for each string * continue with the main loop as soon as there is a real change */ if( level1==0 && (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 ) { /* cp1 case-folds to the code point "length" or to p[length] */ if(U_IS_SURROGATE(c1)) { if(U_IS_SURROGATE_LEAD(c1)) { /* advance beyond source surrogate pair if it case-folds */ ++s1; } else /* isTrail(c1) */ { /* * we got a supplementary code point when hitting its trail surrogate, * therefore the lead surrogate must have been the same as in the other string; * compare this decomposition with the lead surrogate in the other string * remember that this simulates bulk text replacement: * the decomposition would replace the entire code point */ --s2; c2=*(s2-1); } } /* push current level pointers */ stack1[0].start=start1; stack1[0].s=s1; stack1[0].limit=limit1; ++level1; /* copy the folding result to fold1[] */ if(length<=UCASE_MAX_STRING_LENGTH) { u_memcpy(fold1, p, length); } else { int32_t i=0; U16_APPEND_UNSAFE(fold1, i, length); length=i; } /* set next level pointers to case folding */ start1=s1=fold1; limit1=fold1+length; /* get ready to read from decomposition, continue with loop */ c1=-1; continue; } if( level2==0 && (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 ) { /* cp2 case-folds to the code point "length" or to p[length] */ if(U_IS_SURROGATE(c2)) { if(U_IS_SURROGATE_LEAD(c2)) { /* advance beyond source surrogate pair if it case-folds */ ++s2; } else /* isTrail(c2) */ { /* * we got a supplementary code point when hitting its trail surrogate, * therefore the lead surrogate must have been the same as in the other string; * compare this decomposition with the lead surrogate in the other string * remember that this simulates bulk text replacement: * the decomposition would replace the entire code point */ --s1; c1=*(s1-1); } } /* push current level pointers */ stack2[0].start=start2; stack2[0].s=s2; stack2[0].limit=limit2; ++level2; /* copy the folding result to fold2[] */ if(length<=UCASE_MAX_STRING_LENGTH) { u_memcpy(fold2, p, length); } else { int32_t i=0; U16_APPEND_UNSAFE(fold2, i, length); length=i; } /* set next level pointers to case folding */ start2=s2=fold2; limit2=fold2+length; /* get ready to read from decomposition, continue with loop */ c2=-1; continue; } /* * no decomposition/case folding, max level for both sides: * return difference result * * code point order comparison must not just return cp1-cp2 * because when single surrogates are present then the surrogate pairs * that formed cp1 and cp2 may be from different string indexes * * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units * c1=d800 cp1=10001 c2=dc00 cp2=10000 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } * * therefore, use same fix-up as in ustring.c/uprv_strCompare() * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ * so we have slightly different pointer/start/limit comparisons here */ if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ if( (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) ) { /* part of a surrogate pair, leave >=d800 */ } else { /* BMP code point - may be surrogate code point - make <d800 */ c1-=0x2800; } if( (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) ) { /* part of a surrogate pair, leave >=d800 */ } else { /* BMP code point - may be surrogate code point - make <d800 */ c2-=0x2800; } } return c1-c2; } }
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { if (isFrozen() || isBogus()) { return *this; } if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { const UCaseProps *csp = ucase_getSingleton(); { UnicodeSet foldSet(*this); UnicodeString str; USetAdder sa = { foldSet.toUSet(), _set_add, _set_addRange, _set_addString, NULL, // don't need remove() NULL // don't need removeRange() }; // start with input set to guarantee inclusion // USET_CASE: remove strings because the strings will actually be reduced (folded); // therefore, start with no strings and add only those needed if (attribute & USET_CASE_INSENSITIVE) { foldSet.strings->removeAllElements(); } int32_t n = getRangeCount(); UChar32 result; const UChar *full; int32_t locCache = 0; for (int32_t i=0; i<n; ++i) { UChar32 start = getRangeStart(i); UChar32 end = getRangeEnd(i); if (attribute & USET_CASE_INSENSITIVE) { // full case closure for (UChar32 cp=start; cp<=end; ++cp) { ucase_addCaseClosure(csp, cp, &sa); } } else { // add case mappings // (does not add long s for regular s, or Kelvin for k, for example) for (UChar32 cp=start; cp<=end; ++cp) { result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); addCaseMapping(foldSet, result, full, str); result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); addCaseMapping(foldSet, result, full, str); result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache); addCaseMapping(foldSet, result, full, str); result = ucase_toFullFolding(csp, cp, &full, 0); addCaseMapping(foldSet, result, full, str); } } } if (strings != NULL && strings->size() > 0) { if (attribute & USET_CASE_INSENSITIVE) { for (int32_t j=0; j<strings->size(); ++j) { str = *(const UnicodeString *) strings->elementAt(j); str.foldCase(); if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { foldSet.add(str); // does not map to code points: add the folded string itself } } } else { Locale root(""); #if !UCONFIG_NO_BREAK_ITERATION UErrorCode status = U_ZERO_ERROR; BreakIterator *bi = BreakIterator::createWordInstance(root, status); if (U_SUCCESS(status)) { #endif const UnicodeString *pStr; for (int32_t j=0; j<strings->size(); ++j) { pStr = (const UnicodeString *) strings->elementAt(j); (str = *pStr).toLower(root); foldSet.add(str); #if !UCONFIG_NO_BREAK_ITERATION (str = *pStr).toTitle(bi, root); foldSet.add(str); #endif (str = *pStr).toUpper(root); foldSet.add(str); (str = *pStr).foldCase(); foldSet.add(str); } #if !UCONFIG_NO_BREAK_ITERATION } delete bi; #endif } } *this = foldSet; } } return *this; }
CaseFoldingUCharIterator::CaseFoldingUCharIterator(const UChar *chars, int64_t start, int64_t limit) : fChars(chars), fIndex(start), fLimit(limit), fcsp(NULL), fFoldChars(NULL), fFoldLength(0) { fcsp = ucase_getSingleton(); }
U_CDECL_END const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { umtx_lock(NULL); UBool f = (INCLUSIONS[src] == NULL); umtx_unlock(NULL); if (f) { UnicodeSet* incl = new UnicodeSet(); USetAdder sa = { (USet *)incl, _set_add, _set_addRange, _set_addString, NULL // don't need remove() }; if (incl != NULL) { switch(src) { case UPROPS_SRC_CHAR: uchar_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_PROPSVEC: upropsvec_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_CHAR_AND_PROPSVEC: uchar_addPropertyStarts(&sa, &status); upropsvec_addPropertyStarts(&sa, &status); break; case UPROPS_SRC_HST: uhst_addPropertyStarts(&sa, &status); break; #if !UCONFIG_NO_NORMALIZATION case UPROPS_SRC_NORM: unorm_addPropertyStarts(&sa, &status); break; #endif case UPROPS_SRC_CASE: ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status); break; case UPROPS_SRC_BIDI: ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status); break; default: status = U_INTERNAL_PROGRAM_ERROR; break; } if (U_SUCCESS(status)) { umtx_lock(NULL); if (INCLUSIONS[src] == NULL) { INCLUSIONS[src] = incl; incl = NULL; ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); } umtx_unlock(NULL); } delete incl; } else { status = U_MEMORY_ALLOCATION_ERROR; } } return INCLUSIONS[src]; }