/** * Greek string uppercasing with a state machine. * Probably simpler than a stateless function that has to figure out complex context-before * for each character. * TODO: Try to re-consolidate one way or another with the non-Greek function. */ int32_t toUpper(const UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) { int32_t locCache = UCASE_LOC_GREEK; int32_t destIndex=0; uint32_t state = 0; for (int32_t i = 0; i < srcLength;) { int32_t nextIndex = i; UChar32 c; U16_NEXT(src, nextIndex, srcLength, c); uint32_t nextState = 0; int32_t type = ucase_getTypeOrIgnorable(csm->csp, c); if ((type & UCASE_IGNORABLE) != 0) { // c is case-ignorable nextState |= (state & AFTER_CASED); } else if (type != UCASE_NONE) { // c is cased nextState |= AFTER_CASED; } uint32_t data = getLetterData(c); if (data > 0) { uint32_t upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && (upper == 0x399 || upper == 0x3A5)) { data |= HAS_DIALYTIKA; } int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) { numYpogegrammeni = 1; } // Skip combining diacritics after this Greek letter. while (nextIndex < srcLength) { uint32_t diacriticData = getDiacriticData(src[nextIndex]); if (diacriticData != 0) { data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { ++numYpogegrammeni; } ++nextIndex; } else { break; // not a Greek diacritic } } if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { nextState |= AFTER_VOWEL_WITH_ACCENT; } // Map according to Greek rules. UBool addTonos = FALSE; if (upper == 0x397 && (data & HAS_ACCENT) != 0 && numYpogegrammeni == 0 && (state & AFTER_CASED) == 0 && !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (i == nextIndex) { upper = 0x389; // Preserve the precomposed form. } else { addTonos = TRUE; } } else if ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 0x399) { upper = 0x3AA; data &= ~HAS_EITHER_DIALYTIKA; } else if (upper == 0x3A5) { upper = 0x3AB; data &= ~HAS_EITHER_DIALYTIKA; } } destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper); if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika } if (destIndex >= 0 && addTonos) { destIndex=appendUChar(dest, destIndex, destCapacity, 0x301); } while (destIndex >= 0 && numYpogegrammeni > 0) { destIndex=appendUChar(dest, destIndex, destCapacity, 0x399); --numYpogegrammeni; } if(destIndex<0) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } } else { const UChar *s; UChar32 c2 = 0; c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache); if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { /* fast path version of appendResult() for BMP results */ dest[destIndex++]=(UChar)c2; } else { destIndex=appendResult(dest, destIndex, destCapacity, c, s); if(destIndex<0) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } } } i = nextIndex; state = nextState; } if(destIndex>destCapacity) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } return destIndex; }
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java. void toUpper(uint32_t options, const uint8_t *src, int32_t srcLength, ByteSink &sink, Edits *edits, UErrorCode &errorCode) { uint32_t state = 0; for (int32_t i = 0; i < srcLength;) { int32_t nextIndex = i; UChar32 c; U8_NEXT(src, nextIndex, srcLength, c); uint32_t nextState = 0; int32_t type = ucase_getTypeOrIgnorable(c); if ((type & UCASE_IGNORABLE) != 0) { // c is case-ignorable nextState |= (state & AFTER_CASED); } else if (type != UCASE_NONE) { // c is cased nextState |= AFTER_CASED; } uint32_t data = getLetterData(c); if (data > 0) { uint32_t upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && (upper == 0x399 || upper == 0x3A5)) { data |= HAS_DIALYTIKA; } int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) { numYpogegrammeni = 1; } // Skip combining diacritics after this Greek letter. int32_t nextNextIndex = nextIndex; while (nextIndex < srcLength) { UChar32 c2; U8_NEXT(src, nextNextIndex, srcLength, c2); uint32_t diacriticData = getDiacriticData(c2); if (diacriticData != 0) { data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { ++numYpogegrammeni; } nextIndex = nextNextIndex; } else { break; // not a Greek diacritic } } if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { nextState |= AFTER_VOWEL_WITH_ACCENT; } // Map according to Greek rules. UBool addTonos = FALSE; if (upper == 0x397 && (data & HAS_ACCENT) != 0 && numYpogegrammeni == 0 && (state & AFTER_CASED) == 0 && !isFollowedByCasedLetter(src, nextIndex, srcLength)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (i == nextIndex) { upper = 0x389; // Preserve the precomposed form. } else { addTonos = TRUE; } } else if ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 0x399) { upper = 0x3AA; data &= ~HAS_EITHER_DIALYTIKA; } else if (upper == 0x3A5) { upper = 0x3AB; data &= ~HAS_EITHER_DIALYTIKA; } } UBool change; if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) { change = TRUE; // common, simple usage } else { // Find out first whether we are changing the text. U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block change = (i + 2) > nextIndex || src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) || numYpogegrammeni > 0; int32_t i2 = i + 2; if ((data & HAS_EITHER_DIALYTIKA) != 0) { change |= (i2 + 2) > nextIndex || src[i2] != (uint8_t)u8"\u0308"[0] || src[i2 + 1] != (uint8_t)u8"\u0308"[1]; i2 += 2; } if (addTonos) { change |= (i2 + 2) > nextIndex || src[i2] != (uint8_t)u8"\u0301"[0] || src[i2 + 1] != (uint8_t)u8"\u0301"[1]; i2 += 2; } int32_t oldLength = nextIndex - i; int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399 change |= oldLength != newLength; if (change) { if (edits != NULL) { edits->addReplace(oldLength, newLength); } } else { if (edits != NULL) { edits->addUnchanged(oldLength); } // Write unchanged text? change = (options & U_OMIT_UNCHANGED_TEXT) == 0; } } if (change) { ByteSinkUtil::appendTwoBytes(upper, sink); if ((data & HAS_EITHER_DIALYTIKA) != 0) { sink.Append(u8"\u0308", 2); // restore or add a dialytika } if (addTonos) { sink.Append(u8"\u0301", 2); } while (numYpogegrammeni > 0) { sink.Append(u8"\u0399", 2); --numYpogegrammeni; } } } else if(c>=0) { const UChar *s; c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK); if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) { return; } } else { // Malformed UTF-8. if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i, sink, options, edits, errorCode)) { return; } } i = nextIndex; state = nextState; } }