/**
 * Return the simple (single code point) uppercase mapping of c.
 *
 * The properties word for c is read from csp->trie. Without an exception
 * entry, only characters typed UCASE_LOWER are shifted by the stored delta.
 * With an exception entry, the value of the UCASE_EXC_UPPER slot is used
 * when that slot is present. In every other case c is returned unchanged.
 *
 * @param csp case properties data
 * @param c   code point to map
 * @return the mapped code point, or c itself when nothing applies
 */
U_CAPI UChar32 U_EXPORT2
ucase_toupper(const UCaseProps *csp, UChar32 c) {
    const uint16_t *excData;
    uint16_t excHeader;
    uint16_t props=UTRIE2_GET16(&csp->trie, c);

    if(!PROPS_HAS_EXCEPTION(props)) {
        /* plain entry: the delta only applies to lowercase letters */
        return (UCASE_GET_TYPE(props)==UCASE_LOWER) ? c+UCASE_GET_DELTA(props) : c;
    }

    /* exception entry: first word holds the slot flags, slots follow it */
    excData=GET_EXCEPTIONS(csp, props);
    excHeader=*excData++;
    if(HAS_SLOT(excHeader, UCASE_EXC_UPPER)) {
        GET_SLOT_VALUE(excHeader, UCASE_EXC_UPPER, excData, c);
    }
    return c;
}
/**
 * Return the simple case folding mapping for c.
 *
 * Without an exception entry, characters whose type is at least UCASE_UPPER
 * are shifted by the stored delta. With an exception entry flagged
 * UCASE_EXC_CONDITIONAL_FOLD, U+0049 and U+0130 are folded by the hard-coded
 * rules below, selected by (options & _FOLD_CASE_OPTIONS_MASK). Otherwise the
 * UCASE_EXC_FOLD slot is used, falling back to the UCASE_EXC_LOWER slot.
 *
 * @param csp     case properties data
 * @param c       code point to fold
 * @param options case folding options; only the bits in
 *                _FOLD_CASE_OPTIONS_MASK are examined here
 * @return the folded code point, or c itself when nothing applies
 */
U_CAPI UChar32 U_EXPORT2
ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
    const uint16_t *excData;
    uint16_t excHeader;
    int32_t slot;
    uint16_t props=UTRIE2_GET16(&csp->trie, c);

    if(!PROPS_HAS_EXCEPTION(props)) {
        return (UCASE_GET_TYPE(props)>=UCASE_UPPER) ? c+UCASE_GET_DELTA(props) : c;
    }

    excData=GET_EXCEPTIONS(csp, props);
    excHeader=*excData++;

    if((excHeader&UCASE_EXC_CONDITIONAL_FOLD)!=0) {
        /* special case folding mappings, hardcoded */
        if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
            /* default mappings */
            switch(c) {
            case 0x49:
                /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
                return 0x69;
            case 0x130:
                /* no simple case folding for U+0130 */
                return c;
            default:
                break;
            }
        } else {
            /* Turkic mappings */
            switch(c) {
            case 0x49:
                /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
                return 0x131;
            case 0x130:
                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                return 0x69;
            default:
                break;
            }
        }
    }

    /* prefer the dedicated fold slot, otherwise reuse the lowercase slot */
    if(!HAS_SLOT(excHeader, UCASE_EXC_FOLD) && !HAS_SLOT(excHeader, UCASE_EXC_LOWER)) {
        return c;
    }
    slot= HAS_SLOT(excHeader, UCASE_EXC_FOLD) ? UCASE_EXC_FOLD : UCASE_EXC_LOWER;
    GET_SLOT_VALUE(excHeader, slot, excData, c);
    return c;
}
/**
 * Allocate a new exception-properties record for c and return the
 * properties word rewritten to refer to it.
 *
 * If value carries a simple case mapping (type above UCASE_NONE and a
 * non-zero delta), that mapping is copied into the new record first:
 * for UCASE_LOWER into suc and stc, otherwise into slc. The returned word
 * has the previous exception-index and delta bits cleared, the index of the
 * new record stored at UGENCASE_EXC_SHIFT, and UCASE_EXCEPTION set.
 *
 * @param c         code point the properties word belongs to
 * @param value     current properties word for c
 * @param errorCode in/out error code; nothing is done if it already
 *                  indicates failure. Set to U_INDEX_OUTOFBOUNDS_ERROR when
 *                  MAX_EXC_COUNT records exist, or U_MEMORY_ALLOCATION_ERROR
 *                  when the allocation yields NULL.
 * @return the updated properties word, or 0 on any failure
 */
uint32_t
CasePropsBuilder::makeExcProps(UChar32 c, uint32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    if(excPropsCount==MAX_EXC_COUNT) {
        fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    LocalPointer<ExcProps> newExcProps(new ExcProps);
    if(newExcProps==NULL) {
        fprintf(stderr,
                "genprops error: casepropsbuilder out of memory allocating "
                "exceptions properties\n");
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    const uint32_t type=value&UCASE_TYPE_MASK;
    if(type>UCASE_NONE) {
        /* Decode the simple case mapping. */
        const UChar32 mapped=c+UCASE_GET_DELTA(value);
        if(mapped!=c) {
            UniProps &simple=newExcProps->props;
            if(type==UCASE_LOWER) {
                simple.stc=mapped;
                simple.suc=mapped;
            } else {
                simple.slc=mapped;
            }
        }
    }

    /* remove previous simple mapping, then point the word at the new record */
    value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK);
    value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
    value|=UCASE_EXCEPTION;

    /* the array takes over ownership of the record */
    excProps[excPropsCount++]=newExcProps.orphan();
    return value;
}
/**
 * Full lowercase mapping of c, including the hard-coded conditional
 * mappings that follow SpecialCasing.txt.
 *
 * Return value, as produced by the code below:
 *  - ~c when the resulting mapping equals c,
 *  - a single code point when there is a one-code-point mapping,
 *  - a string length, with *pString pointing at the mapping string,
 *  - 0 where a hard-coded rule removes the character.
 *
 * @param csp      case properties data
 * @param c        code point to map
 * @param iter     context iterator passed through to the context tests
 * @param context  opaque context passed through to the context tests
 * @param pString  receives the mapping string pointer for string results
 * @param locale   locale ID, passed to ucase_getCaseLocale()
 * @param locCache locale cache value, passed to ucase_getCaseLocale()
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  const char *locale, int32_t *locCache)
{
    const uint16_t *excData, *excSlots;
    uint16_t excHeader;
    int32_t loc, fullLengths;
    UChar32 mapped;
    uint16_t props=UTRIE2_GET16(&csp->trie, c);

    if(!PROPS_HAS_EXCEPTION(props)) {
        mapped= (UCASE_GET_TYPE(props)>=UCASE_UPPER) ? c+UCASE_GET_DELTA(props) : c;
        return (mapped==c) ? ~mapped : mapped;
    }

    excData=GET_EXCEPTIONS(csp, props);
    excHeader=*excData++;
    excSlots=excData; /* untouched copy for the simple-mapping lookup at the end */

    if((excHeader&UCASE_EXC_CONDITIONAL_SPECIAL)!=0) {
        /*
         * Hardcoded conditions and mappings.
         * Conditional mappings are tested first (otherwise the unconditional
         * default mappings would always be taken), then characters with
         * unconditional mappings in SpecialCasing.txt; if none applies, the
         * simple lowercase slot is used further down.
         */
        loc=ucase_getCaseLocale(locale, locCache);

        if(loc==UCASE_LOC_LITHUANIAN) {
            /*
                # Lithuanian
                # Lithuanian retains the dot in a lowercase i when followed by accents.
                # Introduce an explicit dot above when lowercasing capital I's and J's
                # whenever there are more accents above.
                # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

                0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
             */
            switch(c) {
            /* base characters: only when more accents above follow */
            case 0x49:  /* LATIN CAPITAL LETTER I */
                if(isFollowedByMoreAbove(csp, iter, context)) {
                    *pString=iDot;
                    return 2;
                }
                break;
            case 0x4a:  /* LATIN CAPITAL LETTER J */
                if(isFollowedByMoreAbove(csp, iter, context)) {
                    *pString=jDot;
                    return 2;
                }
                break;
            case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                if(isFollowedByMoreAbove(csp, iter, context)) {
                    *pString=iOgonekDot;
                    return 2;
                }
                break;
            /* precomposed with accent above, no need to find one */
            case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                *pString=iDotGrave;
                return 3;
            case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                *pString=iDotAcute;
                return 3;
            case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                *pString=iDotTilde;
                return 3;
            default:
                break;
            }
        }

        if(loc==UCASE_LOC_TURKISH) {
            /*
                # Turkish and Azeri
                # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                # The following rules handle those cases.
             */
            if(c==0x130) {
                /*
                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            }
            if(c==0x307 && isPrecededBy_I(csp, iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            }
            if(c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            }
        }

        if(c==0x130) {
            /*
                # Preserve canonical equivalence for I with dot. Turkic is handled above.

                0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
             */
            *pString=iDot;
            return 2;
        }

        if( c==0x3a3 &&
            !isFollowedByCasedLetter(csp, iter, context, 1) &&
            isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
        ) {
            /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
            /*
                # Special case for final form of sigma

                03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
             */
            return 0x3c2; /* greek small final sigma */
        }

        /* no known conditional special case mapping, use a normal mapping */
    } else if(HAS_SLOT(excHeader, UCASE_EXC_FULL_MAPPINGS)) {
        GET_SLOT_VALUE(excHeader, UCASE_EXC_FULL_MAPPINGS, excData, fullLengths);
        fullLengths&=UCASE_FULL_LOWER;
        if(fullLengths!=0) {
            /* the lowercase string starts right behind the lengths slot */
            *pString=excData+1;
            return fullLengths;
        }
    }

    /* simple lowercase mapping, if the exception entry has one */
    mapped=c;
    if(HAS_SLOT(excHeader, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excHeader, UCASE_EXC_LOWER, excSlots, mapped);
    }
    return (mapped==c) ? ~mapped : mapped;
}
/**
 * Add the case closure of c to the set behind sa: every simple case mapping,
 * the full case folding string, and the code points of the stored closure
 * string, as far as the data for c has them.
 *
 * The closure of i and its relatives is hardcoded and the data file data for
 * these characters is ignored. The Turkic dotless i and dotted I with their
 * case mapping conditions and case folding option make the related
 * characters behave specially; this code matches their closure behavior to
 * their case folding behavior.
 *
 * @param csp case properties data
 * @param c   code point whose closure is added
 * @param sa  set adder; its add() and addString() callbacks receive sa->set
 */
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
    const uint16_t *slotBase, *cursor;
    const UChar *closure;
    uint16_t props, excHeader;
    int32_t slot, pos, closureLength, fullLengths;
    int32_t lowerLength, foldLength, upperLength, titleLength;
    UChar32 mapped;

    if(c==0x49) {
        /* regular i and I are in one equivalence class */
        sa->add(sa->set, 0x69);
        return;
    }
    if(c==0x69) {
        sa->add(sa->set, 0x49);
        return;
    }
    if(c==0x130) {
        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
        sa->addString(sa->set, iDot, 2);
        return;
    }
    if(c==0x131) {
        /* dotless i is in a class by itself */
        return;
    }

    /* otherwise use the data file data */
    props=UTRIE2_GET16(&csp->trie, c);
    if(!PROPS_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
        return;
    }

    /*
     * c has exceptions, so there may be multiple simple and/or
     * full case mappings. Add them all.
     */
    cursor=GET_EXCEPTIONS(csp, props);
    excHeader=*cursor++;
    slotBase=cursor; /* every slot lookup restarts from here */

    /* add all simple case mappings */
    for(slot=UCASE_EXC_LOWER; slot<=UCASE_EXC_TITLE; ++slot) {
        if(!HAS_SLOT(excHeader, slot)) {
            continue;
        }
        cursor=slotBase;
        GET_SLOT_VALUE(excHeader, slot, cursor, mapped);
        sa->add(sa->set, mapped);
    }

    /* get the closure string pointer & length */
    closureLength=0;
    closure=NULL;
    if(HAS_SLOT(excHeader, UCASE_EXC_CLOSURE)) {
        cursor=slotBase;
        GET_SLOT_VALUE(excHeader, UCASE_EXC_CLOSURE, cursor, closureLength);
        closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
        closure=(const UChar *)cursor+1; /* behind this slot, unless there are full case mappings */
    }

    /* add the full case folding */
    if(HAS_SLOT(excHeader, UCASE_EXC_FULL_MAPPINGS)) {
        cursor=slotBase;
        GET_SLOT_VALUE(excHeader, UCASE_EXC_FULL_MAPPINGS, cursor, fullLengths);

        /* start of full case mapping strings */
        ++cursor;

        /* bits 16 and higher are reserved; the rest holds four 4-bit lengths */
        fullLengths&=0xffff;
        lowerLength=fullLengths&UCASE_FULL_LOWER;
        foldLength=(fullLengths>>4)&0xf;
        upperLength=(fullLengths>>8)&0xf;
        titleLength=fullLengths>>12;

        /* skip the lowercase result string */
        cursor+=lowerLength;

        /* add the full case folding string */
        if(foldLength!=0) {
            sa->addString(sa->set, (const UChar *)cursor, foldLength);
        }

        /* skip the folding, uppercase and titlecase strings */
        cursor+=foldLength+upperLength+titleLength;

        closure=(const UChar *)cursor; /* behind full case mappings */
    }

    /* add each code point in the closure string */
    pos=0;
    while(pos<closureLength) {
        U16_NEXT_UNSAFE(closure, pos, mapped);
        sa->add(sa->set, mapped);
    }
}
/**
 * Full case folding of c.
 *
 * Return value, as produced by the code below:
 *  - ~c when the resulting folding equals c or no fold/lowercase slot exists,
 *  - a single code point when there is a one-code-point folding,
 *  - a string length, with *pString pointing at the folding string.
 *
 * @param csp     case properties data
 * @param c       code point to fold
 * @param pString receives the folding string pointer for string results
 * @param options case folding options; only the bits in
 *                _FOLD_CASE_OPTIONS_MASK are examined here
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
                    const UChar **pString,
                    uint32_t options)
{
    const uint16_t *excData, *excSlots;
    uint16_t excHeader;
    int32_t fullLengths, foldLength, slot;
    UChar32 folded;
    uint16_t props=UTRIE2_GET16(&csp->trie, c);

    if(!PROPS_HAS_EXCEPTION(props)) {
        folded= (UCASE_GET_TYPE(props)>=UCASE_UPPER) ? c+UCASE_GET_DELTA(props) : c;
        return (folded==c) ? ~folded : folded;
    }

    excData=GET_EXCEPTIONS(csp, props);
    excHeader=*excData++;
    excSlots=excData; /* untouched copy for the simple-mapping lookup at the end */

    if((excHeader&UCASE_EXC_CONDITIONAL_FOLD)!=0) {
        /* use hardcoded conditions and mappings */
        if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
            /* default mappings */
            switch(c) {
            case 0x49:
                /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
                return 0x69;
            case 0x130:
                /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                *pString=iDot;
                return 2;
            default:
                break;
            }
        } else {
            /* Turkic mappings */
            switch(c) {
            case 0x49:
                /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
                return 0x131;
            case 0x130:
                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                return 0x69;
            default:
                break;
            }
        }
    } else if(HAS_SLOT(excHeader, UCASE_EXC_FULL_MAPPINGS)) {
        GET_SLOT_VALUE(excHeader, UCASE_EXC_FULL_MAPPINGS, excData, fullLengths);

        /* the folding string sits behind the lengths slot and the lowercase string */
        foldLength=(fullLengths>>4)&0xf;
        if(foldLength!=0) {
            *pString=excData+1+(fullLengths&UCASE_FULL_LOWER);
            return foldLength;
        }
    }

    /* simple mapping: prefer the fold slot, otherwise reuse the lowercase slot */
    if(!HAS_SLOT(excHeader, UCASE_EXC_FOLD) && !HAS_SLOT(excHeader, UCASE_EXC_LOWER)) {
        return ~c;
    }
    slot= HAS_SLOT(excHeader, UCASE_EXC_FOLD) ? UCASE_EXC_FOLD : UCASE_EXC_LOWER;
    folded=c;
    GET_SLOT_VALUE(excHeader, slot, excSlots, folded);
    return (folded==c) ? ~folded : folded;
}
/* internal */ static int32_t toUpperOrTitle(const UCaseProps *csp, UChar32 c, UCaseContextIterator *iter, void *context, const UChar **pString, const char *locale, int32_t *locCache, UBool upperNotTitle) { UChar32 result=c; uint16_t props=UTRIE2_GET16(&csp->trie, c); if(!PROPS_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { result=c+UCASE_GET_DELTA(props); } } else { const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; uint16_t excWord=*pe++; int32_t full, idx; pe2=pe; if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { /* use hardcoded conditions and mappings */ int32_t loc=ucase_getCaseLocale(locale, locCache); if(loc==UCASE_LOC_TURKISH && c==0x69) { /* # Turkish and Azeri # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri # The following rules handle those cases. # When uppercasing, i turns into a dotted capital I 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I */ return 0x130; } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) { /* # Lithuanian # Lithuanian retains the dot in a lowercase i when followed by accents. 
# Remove DOT ABOVE after "i" with upper or titlecase 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE */ return 0; /* remove the dot (continue without output) */ } else { /* no known conditional special case mapping, use a normal mapping */ } } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); /* start of full case mapping strings */ ++pe; /* skip the lowercase and case-folding result strings */ pe+=full&UCASE_FULL_LOWER; full>>=4; pe+=full&0xf; full>>=4; if(upperNotTitle) { full&=0xf; } else { /* skip the uppercase result string */ pe+=full&0xf; full=(full>>4)&0xf; } if(full!=0) { /* set the output pointer to the result string */ *pString=pe; /* return the string length */ return full; } } if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { idx=UCASE_EXC_TITLE; } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { /* here, titlecase is same as uppercase */ idx=UCASE_EXC_UPPER; } else { return ~c; } GET_SLOT_VALUE(excWord, idx, pe2, result); } return (result==c) ? ~result : result; }