/*---------------------------------------------------------------------------------------------- Convert the Graphite character offset to the decomposed NFD character offset used internally by views code. ----------------------------------------------------------------------------------------------*/ int FwGrTxtSrc::GrToVwOffset(int grOffset) { if (!m_useNFC) { // the Graphite offset is a NFD offset return grOffset; } else { // convert NFC offsets to internal NFD offsets if (grOffset == 0) return 0; HRESULT hr; int cch; IgnoreHr(hr = m_qts->get_Length(&cch)); if (FAILED(hr)) throw; if (grOffset > cch) // grOffset points beyond the available text, i.e. is invalid. return cch + 10; // arbitrary number that is bigger than NFD text StrUni stuNfd; wchar_t* pchNfd; stuNfd.SetSize(cch + 1, &pchNfd); IgnoreHr(hr = m_qts->Fetch(0, cch, pchNfd)); if (FAILED(hr)) throw; pchNfd[cch] = '\0'; wchar_t szOut[kNFDBufferSize]; UCharIterator iter; uiter_setString(&iter, pchNfd, -1); int curGrOffset = 0; while (iter.hasNext(&iter)) { int index = iter.getIndex(&iter, UITER_CURRENT); if (curGrOffset >= grOffset) return index; UBool neededToNormalize; UErrorCode uerr = U_ZERO_ERROR; int outLen = unorm_next(&iter, szOut, kNFDBufferSize, UNORM_NFC, 0, TRUE, &neededToNormalize, &uerr); Assert(U_SUCCESS(uerr)); curGrOffset++; for (int i = 1; i < outLen; i++) { if (curGrOffset >= grOffset) return index + i; curGrOffset++; } } return iter.getIndex(&iter, UITER_CURRENT); } }
int FwGrTxtSrc::VwToGrOffset(int vwOffset, bool& badOffset) { badOffset = false; if (!m_useNFC) { // the NFD offset is a Graphite offset return vwOffset; } else { // convert internal NFD offsets to NFC offsets if (vwOffset == 0) return 0; HRESULT hr; int cch; IgnoreHr(hr = m_qts->get_Length(&cch)); if (FAILED(hr)) throw; if (vwOffset > cch) return vwOffset; StrUni stuNfd; wchar_t* pchNfd; stuNfd.SetSize(cch + 1, &pchNfd); IgnoreHr(hr = m_qts->Fetch(0, cch, pchNfd)); if (FAILED(hr)) throw; pchNfd[cch] = '\0'; wchar_t szOut[kNFDBufferSize]; UCharIterator iter; uiter_setString(&iter, pchNfd, -1); int curGrOffset = 0; while (iter.hasNext(&iter)) { int index = iter.getIndex(&iter, UITER_CURRENT); UBool neededToNormalize; UErrorCode uerr = U_ZERO_ERROR; int outLen = unorm_next(&iter, szOut, kNFDBufferSize, UNORM_NFC, 0, TRUE, &neededToNormalize, &uerr); Assert(U_SUCCESS(uerr)); for (int i = 0; i < outLen; i++) { if (index + i + 1 > vwOffset) return curGrOffset; curGrOffset++; } if (neededToNormalize && iter.getIndex(&iter, UITER_CURRENT) > vwOffset) badOffset = true; } return curGrOffset; } }
char * Unicode_Normalize(const char *str, // IN UnicodeNormalizationForm form) // IN { UNormalizationMode mode; UChar *uchars; char *result; int32_t normalizedLen; UErrorCode status = U_ZERO_ERROR; UCharIterator strIter; UBool neededToNormalize = FALSE; uiter_setUTF8(&strIter, (const char *)str, -1); switch (form) { case UNICODE_NORMAL_FORM_C: mode = UNORM_NFC; break; case UNICODE_NORMAL_FORM_D: mode = UNORM_NFD; break; default: NOT_REACHED(); } normalizedLen = unorm_next(&strIter, NULL, 0, mode, 0, TRUE, &neededToNormalize, &status); if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { // We expect U_BUFFER_OVERFLOW_ERROR here. Anything else is a problem. ASSERT(U_SUCCESS(status)); return NULL; } uchars = Util_SafeMalloc(sizeof *uchars * normalizedLen); // Reset back to the beginning of the UTF-8 input. (*strIter.move)(&strIter, 0, UITER_START); status = U_ZERO_ERROR; normalizedLen = unorm_next(&strIter, uchars, normalizedLen, mode, 0, TRUE, &neededToNormalize, &status); if (U_FAILURE(status)) { ASSERT(U_SUCCESS(status)); return NULL; } result = Unicode_AllocWithLength(uchars, normalizedLen * 2, STRING_ENCODING_UTF16); free(uchars); return result; }
/**
 * Normalizes the range [offsets.start, offsets.limit) of `text` in place,
 * one unorm_next() chunk at a time, using this transliterator's fMode.
 *
 * Each chunk whose normalized form differs from the input is written back
 * with text.handleReplaceBetween(), and offsets.limit / offsets.contextLimit
 * are shifted by the resulting length change. On exit offsets.start is set
 * to the position where processing stopped.
 *
 * @param text          the text object that is read and modified
 * @param offsets       in/out positions; start, limit and contextLimit are updated
 * @param isIncremental when TRUE, a chunk that reaches the input limit is left
 *                      untouched unless every code point of its normalized
 *                      form passes unorm_isNFSkippable()
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    // empty (or inverted) range: nothing to do, offsets stay untouched
    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range

        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        // on failure the buffer content is not usable, so the string is left empty
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            // (length holds the value returned by the overflowing call)
            iter.index = start;
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        // any remaining error ends processing; offsets.start is still updated below
        if(U_FAILURE(errorCode)) {
            break;
        }

        // from here on, limit is the end of the chunk just consumed by the iterator
        limit = iter.index;

        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    // outLength doubles as the "found a non-skippable code point" flag
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    // report how far processing got, whether it finished or stopped early
    offsets.start = start;
}
/**
 * Computes the phonebook index label for the text behind `iter` and stores it
 * in out[0].
 *
 * The first chunk of the text is NFD-normalized into `out`; its first code
 * unit is then upper-cased and mapped, in this order, through
 * DEFAULT_CHAR_MAP, the kana-to-hiragana conversion with the traditional
 * hiragana row grouping, and the CJK handling.
 *
 * @param iter    iterator positioned at the start of the text
 * @param locale  locale name; only a leading "ja" is significant. May be NULL,
 *                which is treated as "not Japanese".
 * @param out     output buffer, also used as scratch space for normalization
 * @param size    capacity of `out` in UChars; must be at least MIN_OUTPUT_SIZE
 * @param isError set to TRUE if `size` is too small or normalization fails,
 *                FALSE otherwise
 * @return 1 if out[0] holds an index label, 0 if there is none (error, empty
 *         input, non-letter, or CJK text outside a Japanese locale)
 */
int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
        UBool * isError)
{
    if (size < MIN_OUTPUT_SIZE) {
        *isError = TRUE;
        return 0;
    }

    *isError = FALSE;

    // Normalize the first character to remove accents using the NFD normalization
    UErrorCode errorCode = U_ZERO_ERROR;
    int32_t len = unorm_next(iter, out, size, UNORM_NFD,
            0 /* options */, TRUE /* normalize */, NULL, &errorCode);
    if (U_FAILURE(errorCode)) {
        *isError = TRUE;
        return 0;
    }

    if (len == 0) {   // Empty input string
        return 0;
    }

    UChar c = out[0];

    // We are only interested in letters
    if (!u_isalpha(c)) {
        return 0;
    }

    c = u_toupper(c);

    // Check for explicitly mapped characters
    UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
    if (c_mapped != 0) {
        out[0] = c_mapped;
        return 1;
    }

    // Convert Kanas to Hiragana
    // NOTE(review): out[1] is only consulted when len > 2, so a two-unit
    // decomposition passes 0 as the follow-up unit -- confirm whether
    // len >= 2 was intended.
    UChar next = len > 2 ? out[1] : 0;
    c = android::GetNormalizedCodePoint(c, next, NULL);

    // Traditional grouping of Hiragana characters
    if (0x3042 <= c && c <= 0x309F) {
        if (c < 0x304B) c = 0x3042;         // a
        else if (c < 0x3055) c = 0x304B;    // ka
        else if (c < 0x305F) c = 0x3055;    // sa
        else if (c < 0x306A) c = 0x305F;    // ta
        else if (c < 0x306F) c = 0x306A;    // na
        else if (c < 0x307E) c = 0x306F;    // ha
        else if (c < 0x3084) c = 0x307E;    // ma
        else if (c < 0x3089) c = 0x3084;    // ya
        else if (c < 0x308F) c = 0x3089;    // ra
        else c = 0x308F;                    // wa
        out[0] = c;
        return 1;
    }

    if (is_CJK(c)) {
        // A missing locale cannot be Japanese; guard before handing it to strncmp.
        if (locale != NULL && strncmp(locale, "ja", 2) == 0) {
            // Japanese word meaning "misc" or "other"
            out[0] = 0x4ED6;
            return 1;
        } else {
            return 0;
        }
    }

    out[0] = c;
    return 1;
}