/** * Greek string uppercasing with a state machine. * Probably simpler than a stateless function that has to figure out complex context-before * for each character. * TODO: Try to re-consolidate one way or another with the non-Greek function. */ int32_t toUpper(const UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) { int32_t locCache = UCASE_LOC_GREEK; int32_t destIndex=0; uint32_t state = 0; for (int32_t i = 0; i < srcLength;) { int32_t nextIndex = i; UChar32 c; U16_NEXT(src, nextIndex, srcLength, c); uint32_t nextState = 0; int32_t type = ucase_getTypeOrIgnorable(csm->csp, c); if ((type & UCASE_IGNORABLE) != 0) { // c is case-ignorable nextState |= (state & AFTER_CASED); } else if (type != UCASE_NONE) { // c is cased nextState |= AFTER_CASED; } uint32_t data = getLetterData(c); if (data > 0) { uint32_t upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && (upper == 0x399 || upper == 0x3A5)) { data |= HAS_DIALYTIKA; } int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) { numYpogegrammeni = 1; } // Skip combining diacritics after this Greek letter. 
while (nextIndex < srcLength) { uint32_t diacriticData = getDiacriticData(src[nextIndex]); if (diacriticData != 0) { data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { ++numYpogegrammeni; } ++nextIndex; } else { break; // not a Greek diacritic } } if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { nextState |= AFTER_VOWEL_WITH_ACCENT; } // Map according to Greek rules. UBool addTonos = FALSE; if (upper == 0x397 && (data & HAS_ACCENT) != 0 && numYpogegrammeni == 0 && (state & AFTER_CASED) == 0 && !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (i == nextIndex) { upper = 0x389; // Preserve the precomposed form. } else { addTonos = TRUE; } } else if ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 0x399) { upper = 0x3AA; data &= ~HAS_EITHER_DIALYTIKA; } else if (upper == 0x3A5) { upper = 0x3AB; data &= ~HAS_EITHER_DIALYTIKA; } } destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper); if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika } if (destIndex >= 0 && addTonos) { destIndex=appendUChar(dest, destIndex, destCapacity, 0x301); } while (destIndex >= 0 && numYpogegrammeni > 0) { destIndex=appendUChar(dest, destIndex, destCapacity, 0x399); --numYpogegrammeni; } if(destIndex<0) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } } else { const UChar *s; UChar32 c2 = 0; c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache); if((destIndex<destCapacity) && (c<0 ? 
(c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { /* fast path version of appendResult() for BMP results */ dest[destIndex++]=(UChar)c2; } else { destIndex=appendResult(dest, destIndex, destCapacity, c, s); if(destIndex<0) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } } } i = nextIndex; state = nextState; } if(destIndex>destCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } return destIndex; }
/*
 * Full lowercase mapping of c, including the hardcoded conditional mappings
 * from SpecialCasing.txt.
 *
 * Return value, as produced by the code below:
 *   - ~c (negative) when c maps to itself;
 *   - a single code point for a simple mapping;
 *   - a string length, with *pString pointing at the mapping string,
 *     for a multi-character mapping;
 *   - 0 when the character is to be removed (no output).
 *
 * iter/context are handed to the context helpers (isFollowedByMoreAbove() etc.);
 * locale/locCache are handed to ucase_getCaseLocale().
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  const char *locale, int32_t *locCache) {
    UChar32 lower=c;
    const uint16_t props=UTRIE2_GET16(&csp->trie, c);

    /* Common case: no exception data, just an optional delta for uppercase/titlecase types. */
    if(!PROPS_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
            lower=c+UCASE_GET_DELTA(props);
        }
        return (lower==c) ? ~lower : lower;
    }

    const uint16_t *exceptions=GET_EXCEPTIONS(csp, props);
    uint16_t excWord=exceptions[0];
    /*
     * Two independent cursors over the slots that follow the exception word:
     * GET_SLOT_VALUE() is given one cursor per lookup, so the full-mappings
     * lookup and the simple-lowercase lookup each start from the first slot.
     */
    const uint16_t *fullCursor=exceptions+1;
    const uint16_t *simpleCursor=exceptions+1;

    if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
        /*
         * Hardcoded conditions and mappings.
         * Conditional mappings are tested first (otherwise the unconditional
         * default mappings would always be taken), then characters with
         * unconditional mappings in SpecialCasing.txt; if nothing matches,
         * the simple mapping further down is used.
         * Every test that matches returns immediately, so the tests run in
         * sequence without an else-chain.
         */
        const int32_t loc=ucase_getCaseLocale(locale, locCache);

        if(loc==UCASE_LOC_LITHUANIAN) {
            /*
             * Lithuanian retains the dot in a lowercase i when followed by accents:
             * introduce an explicit dot above when lowercasing capital I's and J's
             * whenever there are more accents above
             * (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek).
             *
             * 0049; 0069 0307; lt More_Above   # LATIN CAPITAL LETTER I
             * 004A; 006A 0307; lt More_Above   # LATIN CAPITAL LETTER J
             * 012E; 012F 0307; lt More_Above   # LATIN CAPITAL LETTER I WITH OGONEK
             * 00CC; 0069 0307 0300; lt         # LATIN CAPITAL LETTER I WITH GRAVE
             * 00CD; 0069 0307 0301; lt         # LATIN CAPITAL LETTER I WITH ACUTE
             * 0128; 0069 0307 0303; lt         # LATIN CAPITAL LETTER I WITH TILDE
             */
            switch(c) {
            case 0x49:  /* LATIN CAPITAL LETTER I */
            case 0x4a:  /* LATIN CAPITAL LETTER J */
            case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                /* base characters, find accents above */
                if(isFollowedByMoreAbove(csp, iter, context)) {
                    if(c==0x49) {
                        *pString=iDot;
                    } else if(c==0x4a) {
                        *pString=jDot;
                    } else {
                        *pString=iOgonekDot;
                    }
                    return 2;
                }
                break;
            /* precomposed with accent above, no need to find one */
            case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                *pString=iDotGrave;
                return 3;
            case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                *pString=iDotAcute;
                return 3;
            case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                *pString=iDotTilde;
                return 3;
            default:
                break;
            }
        }

        if(loc==UCASE_LOC_TURKISH) {
            /*
             * Turkish and Azeri: I and i-dotless; I-dot and i are case pairs.
             */
            if(c==0x130) {
                /* 0130; 0069; tr/az   # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                return 0x69;
            }
            if(c==0x307 && isPrecededBy_I(csp, iter, context)) {
                /*
                 * When lowercasing, remove dot_above in the sequence I + dot_above,
                 * which will turn into i. This matches the behavior of the
                 * canonically equivalent I-dot_above.
                 * 0307; ; tr/az After_I   # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            }
            if(c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
                /*
                 * When lowercasing, unless an I is before a dot_above,
                 * it turns into a dotless i.
                 * 0049; 0131; tr/az Not_Before_Dot   # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            }
        }

        if(c==0x130) {
            /*
             * Preserve canonical equivalence for I with dot.
             * The Turkic case was already handled by the test above.
             * 0130; 0069 0307;   # LATIN CAPITAL LETTER I WITH DOT ABOVE
             */
            *pString=iDot;
            return 2;
        }

        if( c==0x3a3 &&
            !isFollowedByCasedLetter(csp, iter, context, 1) &&
            isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
        ) {
            /*
             * Greek capital sigma maps depending on surrounding cased letters
             * (see SpecialCasing.txt).
             * 03A3; 03C2; Final_Sigma   # GREEK CAPITAL LETTER SIGMA
             */
            return 0x3c2; /* greek small final sigma */
        }

        /* no known conditional special case mapping, use a normal mapping */
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t full;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullCursor, full);
        full&=UCASE_FULL_LOWER;
        if(full!=0) {
            /* point the caller at the lowercase mapping string and return its length */
            *pString=fullCursor+1;
            return full;
        }
    }

    if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, simpleCursor, lower);
    }

    return (lower==c) ? ~lower : lower;
}
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java. void toUpper(uint32_t options, const uint8_t *src, int32_t srcLength, ByteSink &sink, Edits *edits, UErrorCode &errorCode) { uint32_t state = 0; for (int32_t i = 0; i < srcLength;) { int32_t nextIndex = i; UChar32 c; U8_NEXT(src, nextIndex, srcLength, c); uint32_t nextState = 0; int32_t type = ucase_getTypeOrIgnorable(c); if ((type & UCASE_IGNORABLE) != 0) { // c is case-ignorable nextState |= (state & AFTER_CASED); } else if (type != UCASE_NONE) { // c is cased nextState |= AFTER_CASED; } uint32_t data = getLetterData(c); if (data > 0) { uint32_t upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && (upper == 0x399 || upper == 0x3A5)) { data |= HAS_DIALYTIKA; } int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) { numYpogegrammeni = 1; } // Skip combining diacritics after this Greek letter. int32_t nextNextIndex = nextIndex; while (nextIndex < srcLength) { UChar32 c2; U8_NEXT(src, nextNextIndex, srcLength, c2); uint32_t diacriticData = getDiacriticData(c2); if (diacriticData != 0) { data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { ++numYpogegrammeni; } nextIndex = nextNextIndex; } else { break; // not a Greek diacritic } } if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { nextState |= AFTER_VOWEL_WITH_ACCENT; } // Map according to Greek rules. 
UBool addTonos = FALSE; if (upper == 0x397 && (data & HAS_ACCENT) != 0 && numYpogegrammeni == 0 && (state & AFTER_CASED) == 0 && !isFollowedByCasedLetter(src, nextIndex, srcLength)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (i == nextIndex) { upper = 0x389; // Preserve the precomposed form. } else { addTonos = TRUE; } } else if ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 0x399) { upper = 0x3AA; data &= ~HAS_EITHER_DIALYTIKA; } else if (upper == 0x3A5) { upper = 0x3AB; data &= ~HAS_EITHER_DIALYTIKA; } } UBool change; if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) { change = TRUE; // common, simple usage } else { // Find out first whether we are changing the text. U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block change = (i + 2) > nextIndex || src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) || numYpogegrammeni > 0; int32_t i2 = i + 2; if ((data & HAS_EITHER_DIALYTIKA) != 0) { change |= (i2 + 2) > nextIndex || src[i2] != (uint8_t)u8"\u0308"[0] || src[i2 + 1] != (uint8_t)u8"\u0308"[1]; i2 += 2; } if (addTonos) { change |= (i2 + 2) > nextIndex || src[i2] != (uint8_t)u8"\u0301"[0] || src[i2 + 1] != (uint8_t)u8"\u0301"[1]; i2 += 2; } int32_t oldLength = nextIndex - i; int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399 change |= oldLength != newLength; if (change) { if (edits != NULL) { edits->addReplace(oldLength, newLength); } } else { if (edits != NULL) { edits->addUnchanged(oldLength); } // Write unchanged text? 
change = (options & U_OMIT_UNCHANGED_TEXT) == 0; } } if (change) { ByteSinkUtil::appendTwoBytes(upper, sink); if ((data & HAS_EITHER_DIALYTIKA) != 0) { sink.Append(u8"\u0308", 2); // restore or add a dialytika } if (addTonos) { sink.Append(u8"\u0301", 2); } while (numYpogegrammeni > 0) { sink.Append(u8"\u0399", 2); --numYpogegrammeni; } } } else if(c>=0) { const UChar *s; c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK); if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) { return; } } else { // Malformed UTF-8. if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i, sink, options, edits, errorCode)) { return; } } i = nextIndex; state = nextState; } }