Example #1
0
/*----------------------------------------------------------------------------------------------
	Convert the Graphite character offset to the decomposed NFD
	character offset used internally by views code.
----------------------------------------------------------------------------------------------*/
int FwGrTxtSrc::GrToVwOffset(int grOffset)
{
	if (!m_useNFC)
	{
		// the Graphite offset is a NFD offset
		return grOffset;
	}
	else
	{
		// convert NFC offsets to internal NFD offsets
		if (grOffset == 0)
			return 0;

		HRESULT hr;
		int cch;
		IgnoreHr(hr = m_qts->get_Length(&cch));
		if (FAILED(hr))
			throw;

		if (grOffset > cch)
			// grOffset points beyond the available text, i.e. is invalid.
			return cch + 10; // arbitrary number that is bigger than NFD text

		StrUni stuNfd;
		wchar_t* pchNfd;
		stuNfd.SetSize(cch + 1, &pchNfd);
		IgnoreHr(hr = m_qts->Fetch(0, cch, pchNfd));
		if (FAILED(hr))
			throw;
		pchNfd[cch] = '\0';

		wchar_t szOut[kNFDBufferSize];
		UCharIterator iter;
		uiter_setString(&iter, pchNfd, -1);
		int curGrOffset = 0;
		while (iter.hasNext(&iter))
		{
			int index = iter.getIndex(&iter, UITER_CURRENT);
			if (curGrOffset >= grOffset)
				return index;
			UBool neededToNormalize;
			UErrorCode uerr = U_ZERO_ERROR;
			int outLen = unorm_next(&iter, szOut, kNFDBufferSize, UNORM_NFC, 0, TRUE, &neededToNormalize, &uerr);
			Assert(U_SUCCESS(uerr));
			curGrOffset++;
			for (int i = 1; i < outLen; i++)
			{
				if (curGrOffset >= grOffset)
					return index + i;
				curGrOffset++;
			}
		}
		return iter.getIndex(&iter, UITER_CURRENT);
	}
}
Example #2
0
int FwGrTxtSrc::VwToGrOffset(int vwOffset, bool& badOffset)
{
	badOffset = false;
	if (!m_useNFC)
	{
		// the NFD offset is a Graphite offset
		return vwOffset;
	}
	else
	{
		// convert internal NFD offsets to NFC offsets
		if (vwOffset == 0)
			return 0;

		HRESULT hr;
		int cch;
		IgnoreHr(hr = m_qts->get_Length(&cch));
		if (FAILED(hr))
			throw;
		if (vwOffset > cch)
			return vwOffset;

		StrUni stuNfd;
		wchar_t* pchNfd;
		stuNfd.SetSize(cch + 1, &pchNfd);
		IgnoreHr(hr = m_qts->Fetch(0, cch, pchNfd));
		if (FAILED(hr))
			throw;
		pchNfd[cch] = '\0';

		wchar_t szOut[kNFDBufferSize];
		UCharIterator iter;
		uiter_setString(&iter, pchNfd, -1);
		int curGrOffset = 0;
		while (iter.hasNext(&iter))
		{
			int index = iter.getIndex(&iter, UITER_CURRENT);
			UBool neededToNormalize;
			UErrorCode uerr = U_ZERO_ERROR;
			int outLen = unorm_next(&iter, szOut, kNFDBufferSize, UNORM_NFC, 0, TRUE, &neededToNormalize, &uerr);
			Assert(U_SUCCESS(uerr));
			for (int i = 0; i < outLen; i++)
			{
				if (index + i + 1 > vwOffset)
					return curGrOffset;
				curGrOffset++;
			}
			if (neededToNormalize && iter.getIndex(&iter, UITER_CURRENT) > vwOffset)
				badOffset = true;
		}
		return curGrOffset;
	}
}
Example #3
0
/*
 *-----------------------------------------------------------------------------
 *
 * Unicode_Normalize --
 *
 *      Runs ICU's unorm_next() over the UTF-8 input in the requested
 *      normalization form (C or D) and returns the output as a new string.
 *
 *      The work is done in two passes: a first call with a NULL buffer
 *      to learn the required length, then a second call into a buffer of
 *      exactly that size.
 *
 *      NOTE(review): ICU documents unorm_next() as normalizing only up to
 *      the next normalization boundary; confirm whether callers rely on
 *      the whole input being normalized.
 *
 * Results:
 *      The string produced by Unicode_AllocWithLength() from the UTF-16
 *      output, or NULL if ICU reports an error. Presumably owned by the
 *      caller -- verify against Unicode_AllocWithLength().
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

char *
Unicode_Normalize(const char *str,               // IN
                  UnicodeNormalizationForm form) // IN
{
   UNormalizationMode mode;
   UChar *uchars;
   char *result;
   int32_t normalizedLen;
   UErrorCode status = U_ZERO_ERROR;
   UCharIterator strIter;
   UBool neededToNormalize = FALSE;

   uiter_setUTF8(&strIter, str, -1);

   switch (form) {
   case UNICODE_NORMAL_FORM_C:
      mode = UNORM_NFC;
      break;
   case UNICODE_NORMAL_FORM_D:
      mode = UNORM_NFD;
      break;
   default:
      NOT_REACHED();
   }

   // Pass 1: no output buffer, only the required length is wanted.
   normalizedLen = unorm_next(&strIter,
                              NULL,
                              0,
                              mode,
                              0,
                              TRUE,
                              &neededToNormalize,
                              &status);

   if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
      // We expect U_BUFFER_OVERFLOW_ERROR here. Anything else is a problem.
      ASSERT(U_SUCCESS(status));
      return NULL;
   }

   uchars = Util_SafeMalloc(sizeof *uchars * normalizedLen);

   // Reset back to the beginning of the UTF-8 input.
   (*strIter.move)(&strIter, 0, UITER_START);

   // Pass 2: normalize into the freshly sized buffer.
   status = U_ZERO_ERROR;
   normalizedLen = unorm_next(&strIter,
                              uchars,
                              normalizedLen,
                              mode,
                              0,
                              TRUE,
                              &neededToNormalize,
                              &status);

   if (U_FAILURE(status)) {
      ASSERT(U_SUCCESS(status));
      // Do not leak the scratch buffer on the error path.
      free(uchars);
      return NULL;
   }

   // The length argument is in bytes: two per UTF-16 code unit.
   result = Unicode_AllocWithLength(uchars,
                                    normalizedLen * 2,
                                    STRING_ENCODING_UTF16);
   free(uchars);

   return result;
}
Example #4
0
/**
 * Normalizes the range [offsets.start, offsets.limit) of text in place, one
 * small chunk at a time, using unorm_next() with this object's fMode.
 *
 * A chunk is written back through text.handleReplaceBetween() only when
 * unorm_next() reports that it needed normalizing. offsets.limit and
 * offsets.contextLimit are then shifted by the resulting change in length.
 * On return offsets.start is the position up to which the text was processed.
 *
 * @param text          the text to normalize; modified in place
 * @param offsets       in/out positions; start, limit and contextLimit are updated
 * @param isIncremental if TRUE, a chunk that reaches the input limit is left
 *                      unprocessed unless every code point of its normalized
 *                      form is reported skippable by unorm_isNFSkippable()
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    // nothing to do for an empty range
    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range
        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        // on failure the output string is left empty
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            iter.index = start;
            // length from the overflowed call is the minimum capacity requested here
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        // any remaining error ends the loop; offsets.start is still updated below
        if(U_FAILURE(errorCode)) {
            break;
        }

        // from here on, limit is the end of the chunk that was just consumed
        limit = iter.index;
        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            // outLength<0 is the marker set above for a non-skippable code point
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    // report how far processing got
    offsets.start = start;
}
/**
 * Computes a phonebook index character from the first normalization chunk
 * read from iter.
 *
 * @param iter    source of the text; advanced by one unorm_next() call
 * @param locale  locale name; only its first two characters are compared
 *                against "ja", and only for CJK characters
 * @param out     output buffer; on success out[0] holds the index character
 * @param size    capacity of out, must be at least MIN_OUTPUT_SIZE
 * @param isError set to TRUE if size is too small or normalization fails,
 *                FALSE otherwise
 * @return 1 if an index character was stored in out[0], otherwise 0
 */
int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
        UBool * isError)
{
  // Refuse to work with an output buffer that is too small.
  *isError = (size < MIN_OUTPUT_SIZE) ? TRUE : FALSE;
  if (*isError) {
    return 0;
  }

  // Normalize the first character to remove accents using the NFD normalization
  UErrorCode status = U_ZERO_ERROR;
  const int32_t normalizedLen = unorm_next(iter, out, size, UNORM_NFD,
          0 /* options */, TRUE /* normalize */, NULL, &status);
  if (U_FAILURE(status)) {
    *isError = TRUE;
    return 0;
  }

  // Empty input string
  if (normalizedLen == 0) {
    return 0;
  }

  // We are only interested in letters
  UChar first = out[0];
  if (!u_isalpha(first)) {
    return 0;
  }
  first = u_toupper(first);

  // Check for explicitly mapped characters
  const UChar mapped = map_character(first, DEFAULT_CHAR_MAP,
          sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
  if (mapped != 0) {
    out[0] = mapped;
    return 1;
  }

  // Convert Kanas to Hiragana
  // NOTE(review): out[1] is only consulted when more than two code units were
  // produced; confirm that `> 2` rather than `>= 2` is intended.
  const UChar following = (normalizedLen > 2) ? out[1] : 0;
  first = android::GetNormalizedCodePoint(first, following, NULL);

  // Traditional grouping of Hiragana characters: every character is replaced
  // by the head of the row it falls into.
  if (0x3042 <= first && first <= 0x309F) {
    static const UChar kRowHeads[] = {
      0x3042,   // a
      0x304B,   // ka
      0x3055,   // sa
      0x305F,   // ta
      0x306A,   // na
      0x306F,   // ha
      0x307E,   // ma
      0x3084,   // ya
      0x3089,   // ra
      0x308F,   // wa
    };
    // Walk down from the last row until its head is not above the character;
    // row 0 always qualifies because of the range check above.
    int row = static_cast<int>(sizeof(kRowHeads) / sizeof(kRowHeads[0])) - 1;
    while (row > 0 && first < kRowHeads[row]) {
      --row;
    }
    out[0] = kRowHeads[row];
    return 1;
  }

  if (is_CJK(first)) {
    if (strncmp(locale, "ja", 2) != 0) {
      return 0;
    }
    // Japanese word meaning "misc" or "other"
    out[0] = 0x4ED6;
    return 1;
  }

  out[0] = first;
  return 1;
}