U_CAPI void U_EXPORT2 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return; } if (locale != NULL && *locale == 0) { csm->locale[0] = 0; csm->caseLocale = UCASE_LOC_ROOT; return; } int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) { *pErrorCode=U_ZERO_ERROR; /* we only really need the language code for case mappings */ length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); } if(length==sizeof(csm->locale)) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } if(U_SUCCESS(*pErrorCode)) { csm->caseLocale=UCASE_LOC_UNKNOWN; csm->caseLocale = ucase_getCaseLocale(csm->locale); } else { csm->locale[0]=0; csm->caseLocale = UCASE_LOC_ROOT; } }
U_CFUNC int32_t ustrcase_getCaseLocale(const char *locale) { if (locale == NULL) { locale = uloc_getDefault(); } if (*locale == 0) { return UCASE_LOC_ROOT; } else { return ucase_getCaseLocale(locale); } }
U_CFUNC int32_t U_CALLCONV ustrcase_internalToUpper(const UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) { int32_t locCache = csm->locCache; if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) { return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode); } UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; return _caseMap( csm, ucase_toFullUpper, dest, destCapacity, src, &csc, 0, srcLength, pErrorCode); }
U_CAPI void U_EXPORT2 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { int32_t length; if(U_FAILURE(*pErrorCode)) { return; } length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) { *pErrorCode=U_ZERO_ERROR; /* we only really need the language code for case mappings */ length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); } if(length==sizeof(csm->locale)) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } csm->locCache=0; if(U_SUCCESS(*pErrorCode)) { ucase_getCaseLocale(csm->locale, &csm->locCache); } else { csm->locale[0]=0; } }
/* * Internal titlecasing function. */ static int32_t _toTitle(UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, UCaseContext *csc, int32_t srcLength, UErrorCode *pErrorCode) { const UChar *s; UChar32 c; int32_t prev, titleStart, titleLimit, idx, destIndex, length; UBool isFirstIndex; if(csm->iter!=NULL) { ubrk_setText(csm->iter, src, srcLength, pErrorCode); } else { csm->iter=ubrk_open(UBRK_WORD, csm->locale, src, srcLength, pErrorCode); } if(U_FAILURE(*pErrorCode)) { return 0; } /* set up local variables */ destIndex=0; prev=0; isFirstIndex=TRUE; /* titlecasing loop */ while(prev<srcLength) { /* find next index where to titlecase */ if(isFirstIndex) { isFirstIndex=FALSE; idx=ubrk_first(csm->iter); } else { idx=ubrk_next(csm->iter); } if(idx==UBRK_DONE || idx>srcLength) { idx=srcLength; } /* * Unicode 4 & 5 section 3.13 Default Case Operations: * * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex * #29, "Text Boundaries." Between each pair of word boundaries, find the first * cased character F. If F exists, map F to default_title(F); then map each * subsequent character C to default_lower(C). * * In this implementation, segment [prev..index[ into 3 parts: * a) uncased characters (copy as-is) [prev..titleStart[ * b) first case letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ if(prev<idx) { /* find and copy uncased characters [prev..titleStart[ */ titleStart=titleLimit=prev; U16_NEXT(src, titleLimit, idx, c); if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { /* Adjust the titlecasing index (titleStart) to the next cased character. */ for(;;) { titleStart=titleLimit; if(titleLimit==idx) { /* * only uncased characters in [prev..index[ * stop with titleStart==titleLimit==index */ break; } U16_NEXT(src, titleLimit, idx, c); if(UCASE_NONE!=ucase_getType(csm->csp, c)) { break; /* cased letter at [titleStart..titleLimit[ */ } } length=titleStart-prev; if(length>0) { if((destIndex+length)<=destCapacity) { uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR); } destIndex+=length; } } if(titleStart<titleLimit) { /* titlecase c which is from [titleStart..titleLimit[ */ csc->cpStart=titleStart; csc->cpLimit=titleLimit; c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache); destIndex=appendResult(dest, destIndex, destCapacity, c, s); /* Special case Dutch IJ titlecasing */ if ( titleStart+1 < idx && ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH && ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) && ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { c=(UChar32) 0x004A; destIndex=appendResult(dest, destIndex, destCapacity, c, s); titleLimit++; } /* lowercase [titleLimit..index[ */ if(titleLimit<idx) { if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { /* Normal operation: Lowercase the rest of the word. */ destIndex+= _caseMap( csm, ucase_toFullLower, dest+destIndex, destCapacity-destIndex, src, csc, titleLimit, idx, pErrorCode); } else { /* Optionally just copy the rest of the word unchanged. */ length=idx-titleLimit; if((destIndex+length)<=destCapacity) { uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR); } destIndex+=length; } } } } prev=idx; } if(destIndex>destCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } return destIndex; }
U_CFUNC int32_t U_CALLCONV ucasemap_internalUTF8ToTitle(const UCaseMap *csm, uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, UErrorCode *pErrorCode) { const UChar *s; UChar32 c; int32_t prev, titleStart, titleLimit, idx, destIndex, length; UBool isFirstIndex; if(U_FAILURE(*pErrorCode)) { return 0; } // Use the C++ abstract base class to minimize dependencies. // TODO: Change UCaseMap.iter to store a BreakIterator directly. BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter); /* set up local variables */ int32_t locCache=csm->locCache; UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; destIndex=0; prev=0; isFirstIndex=TRUE; /* titlecasing loop */ while(prev<srcLength) { /* find next index where to titlecase */ if(isFirstIndex) { isFirstIndex=FALSE; idx=bi->first(); } else { idx=bi->next(); } if(idx==UBRK_DONE || idx>srcLength) { idx=srcLength; } /* * Unicode 4 & 5 section 3.13 Default Case Operations: * * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex * #29, "Text Boundaries." Between each pair of word boundaries, find the first * cased character F. If F exists, map F to default_title(F); then map each * subsequent character C to default_lower(C). * * In this implementation, segment [prev..index[ into 3 parts: * a) uncased characters (copy as-is) [prev..titleStart[ * b) first case letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ if(prev<idx) { /* find and copy uncased characters [prev..titleStart[ */ titleStart=titleLimit=prev; U8_NEXT(src, titleLimit, idx, c); if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { /* Adjust the titlecasing index (titleStart) to the next cased character. */ for(;;) { titleStart=titleLimit; if(titleLimit==idx) { /* * only uncased characters in [prev..index[ * stop with titleStart==titleLimit==index */ break; } U8_NEXT(src, titleLimit, idx, c); if(UCASE_NONE!=ucase_getType(csm->csp, c)) { break; /* cased letter at [titleStart..titleLimit[ */ } } length=titleStart-prev; if(length>0) { if((destIndex+length)<=destCapacity) { uprv_memcpy(dest+destIndex, src+prev, length); } destIndex+=length; } } if(titleStart<titleLimit) { /* titlecase c which is from [titleStart..titleLimit[ */ csc.cpStart=titleStart; csc.cpLimit=titleLimit; c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache); destIndex=appendResult(dest, destIndex, destCapacity, c, s); /* Special case Dutch IJ titlecasing */ if ( titleStart+1 < idx && ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH && ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) && ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { c=0x004A; destIndex=appendResult(dest, destIndex, destCapacity, c, s); titleLimit++; } /* lowercase [titleLimit..index[ */ if(titleLimit<idx) { if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { /* Normal operation: Lowercase the rest of the word. */ destIndex+= _caseMap( csm, ucase_toFullLower, dest+destIndex, destCapacity-destIndex, src, &csc, titleLimit, idx, pErrorCode); } else { /* Optionally just copy the rest of the word unchanged. */ length=idx-titleLimit; if((destIndex+length)<=destCapacity) { uprv_memcpy(dest+destIndex, src+titleLimit, length); } destIndex+=length; } } } } prev=idx; } if(destIndex>destCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } return destIndex; }
U_CAPI int32_t U_EXPORT2 ucase_toFullLower(const UCaseProps *csp, UChar32 c, UCaseContextIterator *iter, void *context, const UChar **pString, const char *locale, int32_t *locCache) { UChar32 result=c; uint16_t props=UTRIE2_GET16(&csp->trie, c); if(!PROPS_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { result=c+UCASE_GET_DELTA(props); } } else { const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; uint16_t excWord=*pe++; int32_t full; pe2=pe; if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { /* use hardcoded conditions and mappings */ int32_t loc=ucase_getCaseLocale(locale, locCache); /* * Test for conditional mappings first * (otherwise the unconditional default mappings are always taken), * then test for characters that have unconditional mappings in SpecialCasing.txt, * then get the UnicodeData.txt mappings. */ if( loc==UCASE_LOC_LITHUANIAN && /* base characters, find accents above */ (((c==0x49 || c==0x4a || c==0x12e) && isFollowedByMoreAbove(csp, iter, context)) || /* precomposed with accent above, no need to find one */ (c==0xcc || c==0xcd || c==0x128)) ) { /* # Lithuanian # Lithuanian retains the dot in a lowercase i when followed by accents. # Introduce an explicit dot above when lowercasing capital I's and J's # whenever there are more accents above. # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE */ switch(c) { case 0x49: /* LATIN CAPITAL LETTER I */ *pString=iDot; return 2; case 0x4a: /* LATIN CAPITAL LETTER J */ *pString=jDot; return 2; case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ *pString=iOgonekDot; return 2; case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ *pString=iDotGrave; return 3; case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ *pString=iDotAcute; return 3; case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ *pString=iDotTilde; return 3; default: return 0; /* will not occur */ } /* # Turkish and Azeri */ } else if(loc==UCASE_LOC_TURKISH && c==0x130) { /* # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri # The following rules handle those cases. 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE */ return 0x69; } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) { /* # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. # This matches the behavior of the canonically equivalent I-dot_above 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE */ return 0; /* remove the dot (continue without output) */ } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) { /* # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I */ return 0x131; } else if(c==0x130) { /* # Preserve canonical equivalence for I with dot. Turkic is handled below. 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ *pString=iDot; return 2; } else if( c==0x3a3 && !isFollowedByCasedLetter(csp, iter, context, 1) && isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */ ) { /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ /* # Special case for final form of sigma 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA */ return 0x3c2; /* greek small final sigma */ } else { /* no known conditional special case mapping, use a normal mapping */ } } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); full&=UCASE_FULL_LOWER; if(full!=0) { /* set the output pointer to the lowercase mapping */ *pString=pe+1; /* return the string length */ return full; } } if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); } } return (result==c) ? ~result : result; }
/* internal */ static int32_t toUpperOrTitle(const UCaseProps *csp, UChar32 c, UCaseContextIterator *iter, void *context, const UChar **pString, const char *locale, int32_t *locCache, UBool upperNotTitle) { UChar32 result=c; uint16_t props=UTRIE2_GET16(&csp->trie, c); if(!PROPS_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { result=c+UCASE_GET_DELTA(props); } } else { const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; uint16_t excWord=*pe++; int32_t full, idx; pe2=pe; if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { /* use hardcoded conditions and mappings */ int32_t loc=ucase_getCaseLocale(locale, locCache); if(loc==UCASE_LOC_TURKISH && c==0x69) { /* # Turkish and Azeri # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri # The following rules handle those cases. # When uppercasing, i turns into a dotted capital I 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I */ return 0x130; } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) { /* # Lithuanian # Lithuanian retains the dot in a lowercase i when followed by accents. # Remove DOT ABOVE after "i" with upper or titlecase 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE */ return 0; /* remove the dot (continue without output) */ } else { /* no known conditional special case mapping, use a normal mapping */ } } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); /* start of full case mapping strings */ ++pe; /* skip the lowercase and case-folding result strings */ pe+=full&UCASE_FULL_LOWER; full>>=4; pe+=full&0xf; full>>=4; if(upperNotTitle) { full&=0xf; } else { /* skip the uppercase result string */ pe+=full&0xf; full=(full>>4)&0xf; } if(full!=0) { /* set the output pointer to the result string */ *pString=pe; /* return the string length */ return full; } } if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { idx=UCASE_EXC_TITLE; } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { /* here, titlecase is same as uppercase */ idx=UCASE_EXC_UPPER; } else { return ~c; } GET_SLOT_VALUE(excWord, idx, pe2, result); } return (result==c) ? ~result : result; }