Beispiel #1
0
static int icu_normalizer_normalize(lua_State *L) {
	const UChar* source = icu4lua_checkustring(L, 1, NORMALIZER_UV_USTRING_META);
	int32_t sourceLength = (int32_t)icu4lua_ustrlen(L, 1);
	UNormalizationMode mode;
	UChar* result;
	int32_t resultLength;
	UErrorCode status;
	int32_t options;

	mode = modes[luaL_checkoption(L, 2, DEFAULT_MODE_OPTION, modeNames)];
	options = (int32_t)luaL_optnumber(L,3,0);

	status = U_ZERO_ERROR;
	resultLength = unorm_normalize(source, sourceLength, mode, options, NULL, 0, &status);
	if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
		lua_pushnil(L);
		lua_pushstring(L, u_errorName(status));
		return 2;
	}

	result = (UChar*)malloc(sizeof(UChar) * resultLength);
	status = U_ZERO_ERROR;
	unorm_normalize(source, sourceLength, mode, options, result, resultLength, &status);
	if (U_FAILURE(status)) {
		free(result);
		lua_pushnil(L);
		lua_pushstring(L, u_errorName(status));
		return 2;
	}

	icu4lua_pushustring(L, result, resultLength, NORMALIZER_UV_USTRING_META, NORMALIZER_UV_USTRING_POOL);
	free(result);
	return 1;
}
Beispiel #2
0
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }
    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
#elif USE(GLIB_UNICODE)
    GOwnPtr<char> UTF8Source;
    UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
    if (!UTF8Source) {
        // If conversion to UTF-8 failed, try with the string without normalization
        return newTextCodec(*this)->encode(characters, length, handling);
    }

    GOwnPtr<char> UTF8Normalized;
    UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));

    long UTF16Length;
    GOwnPtr<UChar> UTF16Normalized;
    UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));

    return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
#elif OS(WINCE)
    // normalization will be done by Windows CE API
    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
#endif
}
Beispiel #3
0
void utf16_normalize(UChar **target, int32_t *target_len, const UChar *src, int32_t src_len, UNormalizationMode nm, UErrorCode *status)
{
    *status = U_ZERO_ERROR;
    if (nm < UNORM_NONE || nm >= UNORM_MODE_COUNT) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (UNORM_NONE == nm) {
        *target = (UChar *) src;
        *target_len = src_len;
        return;
    }
    *target_len = unorm_normalize(src, src_len, nm, 0, NULL, 0, status);
    if (U_BUFFER_OVERFLOW_ERROR != *status) {
        return;
    }
    *status = U_ZERO_ERROR;
    *target = mem_new_n(**target, *target_len + 1);
    /* *target_len = */unorm_normalize(src, src_len, nm, 0, *target, *target_len + 1, status);
    if (U_FAILURE(*status)) {
        efree(*target);
        *target = NULL;
        *target_len = 0;
    } else {
        *(*target + *target_len) = '\0';
        assert(U_ZERO_ERROR == *status);
    }
}
Beispiel #4
0
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }
    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
#elif USE(CASQT_UNICODE) // FIXME:CASQT
#if 0 // 暂不执行normalized,该函数效率太低
    QString str(reinterpret_cast<const ushort*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
#else
	return newTextCodec(*this)->encode(characters, length, handling);
#endif
#endif
}
bool SimpleFontData::canRenderCombiningCharacterSequence(
    const UChar* characters,
    size_t length) const {
  if (!m_combiningCharacterSequenceSupport)
    m_combiningCharacterSequenceSupport = adoptPtr(new HashMap<String, bool>);

  WTF::HashMap<String, bool>::AddResult addResult =
      m_combiningCharacterSequenceSupport->add(String(characters, length),
                                               false);
  if (!addResult.isNewEntry)
    return addResult.storedValue->value;

  UErrorCode error = U_ZERO_ERROR;
  Vector<UChar, 4> normalizedCharacters(length);
  int32_t normalizedLength =
      unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2,
                      &normalizedCharacters[0], length, &error);
  // Can't render if we have an error or no composition occurred.
  if (U_FAILURE(error) || (static_cast<size_t>(normalizedLength) == length))
    return false;

  SkPaint paint;
  m_platformData.setupPaint(&paint);
  paint.setTextEncoding(SkPaint::kUTF16_TextEncoding);
  if (paint.textToGlyphs(&normalizedCharacters[0], normalizedLength * 2, 0)) {
    addResult.storedValue->value = true;
    return true;
  }
  return false;
}
UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks()
{
    // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values
    static const uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8;

    if (m_currentCharacter + 1 >= m_endCharacter)
        return 0;

    if (combiningClass(m_characters[1]) == hiraganaKatakanaVoicingMarksCombiningClass) {
#if USE(ICU_UNICODE)
        // Normalize into composed form using 3.2 rules.
        UChar normalizedCharacters[2] = { 0, 0 };
        UErrorCode uStatus = U_ZERO_ERROR;  
        int32_t resultLength = unorm_normalize(m_characters, 2, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], 2, &uStatus);
        if (resultLength == 1 && !uStatus)
            return normalizedCharacters[0];
#elif USE(QT4_UNICODE)
        QString tmp(reinterpret_cast<const QChar*>(m_characters), 2);
        QString res = tmp.normalized(QString::NormalizationForm_C, QChar::Unicode_3_2);
        if (res.length() == 1)
            return res.at(0).unicode();
#endif
    }

    return 0;
}
Beispiel #7
0
bool SimpleFontData::canRenderCombiningCharacterSequence(const UChar* characters, size_t length) const
{
    if (!m_combiningCharacterSequenceSupport)
        m_combiningCharacterSequenceSupport = adoptPtr(new HashMap<String, bool>);

    WTF::HashMap<String, bool>::AddResult addResult = m_combiningCharacterSequenceSupport->add(String(characters, length), false);
    if (!addResult.isNewEntry)
        return addResult.iterator->value;

    UErrorCode error = U_ZERO_ERROR;
    Vector<UChar, 4> normalizedCharacters(length);
    int32_t normalizedLength = unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], length, &error);
    // Can't render if we have an error or no composition occurred.
    if (U_FAILURE(error) || (static_cast<size_t>(normalizedLength) == length))
        return false;

    FT_Face face = cairo_ft_scaled_font_lock_face(m_platformData.scaledFont());
    if (!face)
        return false;

    if (FcFreeTypeCharIndex(face, normalizedCharacters[0]))
        addResult.iterator->value = true;

    cairo_ft_scaled_font_unlock_face(m_platformData.scaledFont());
    return addResult.iterator->value;
}
bool SimpleFontData::canRenderCombiningCharacterSequence(const UChar* characters, size_t length) const
{
    if (!m_combiningCharacterSequenceSupport)
        m_combiningCharacterSequenceSupport = adoptPtr(new HashMap<String, bool>);

    WTF::HashMap<String, bool>::AddResult addResult = m_combiningCharacterSequenceSupport->add(String(characters, length), false);
    if (!addResult.isNewEntry)
        return addResult.iterator->value;

    UErrorCode error = U_ZERO_ERROR;
    Vector<UChar, 4> normalizedCharacters(length);
    int32_t normalizedLength = unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], length, &error);
    if (U_FAILURE(error))
        return false;

    int position = 0;
    while (position < normalizedLength) {
        UChar32 character;
        int nextPosition = position;
        U16_NEXT(normalizedCharacters, nextPosition, normalizedLength, character);

        if (!u_hasBinaryProperty(character, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
            FS_USHORT glyph = FS_map_char(m_platformData.font(), static_cast<FS_ULONG>(character));
            if (!glyph)
                return false;
        }

        position = nextPosition;
    }

    addResult.iterator->value = true;
    return true;
}
Beispiel #9
0
int32_t __hs_unorm_normalize(const UChar *source, int32_t sourceLength,
			     UNormalizationMode mode, int32_t options,
			     UChar *result, int32_t resultLength,
			     UErrorCode *status)
{
    return unorm_normalize(source, sourceLength, mode, options, result,
			   resultLength, status);
}
Beispiel #10
0
 ToNFC(const UTrie2PerfTest &testcase) : Command(testcase) {
     UErrorCode errorCode=U_ZERO_ERROR;
     destCapacity=unorm_normalize(testcase.getBuffer(), testcase.getBufferLen(),
                                  UNORM_NFC, 0,
                                  NULL, 0,
                                  &errorCode);
     dest=new UChar[destCapacity];
 }
Beispiel #11
0
void NormalizerPerformanceTest::normalizeInput(ULine* dest,const UChar* src ,int32_t srcLen,UNormalizationMode mode, int32_t options){
    int32_t reqLen = 0;
    UErrorCode status = U_ZERO_ERROR;
    for(;;){
        /* pure pre-flight */
        reqLen=unorm_normalize(src,srcLen,mode, options,NULL,0,&status);
        if(status==U_BUFFER_OVERFLOW_ERROR){
            status=U_ZERO_ERROR;
            dest->name = new UChar[reqLen+1];
            reqLen= unorm_normalize(src,srcLen,mode, options,dest->name,reqLen+1,&status);
            dest->len=reqLen;
            break;
        }else if(U_FAILURE(status)){
            printf("Could not normalize input. Error: %s", u_errorName(status));
        }
    }
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }
    return newTextCodec(*this)->encode(source, sourceLength, handling);
#elif USE(JAVA_UNICODE)
    String normalized = TextNormalizer::normalize(
                                     characters, length, TextNormalizer::NFC);
    return newTextCodec(*this)->encode(
                 normalized.characters(), normalized.length(), handling);
#elif OS(WINDOWS) && USE(WCHAR_UNICODE)
    // normalization will be done by Windows CE API
    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
#endif
}
void normalizeCharactersIntoNFCForm(const UChar* characters, unsigned length, Vector<UChar>& buffer)
{
    ASSERT(length);

    buffer.resize(length);

    UErrorCode status = U_ZERO_ERROR;
    size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
    ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
    ASSERT(bufferSize);

    buffer.resize(bufferSize);

    if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
        return;

    status = U_ZERO_ERROR;
    unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
    ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
}
Beispiel #14
0
 virtual void call(UErrorCode* pErrorCode) {
     UErrorCode errorCode=U_ZERO_ERROR;
     int32_t destLength=unorm_normalize(testcase.getBuffer(), testcase.getBufferLen(),
                                        UNORM_NFC, 0,
                                        dest, destCapacity,
                                        &errorCode);
     if(U_FAILURE(errorCode) || destLength!=destCapacity) {
         fprintf(stderr, "error: unorm_normalize(UNORM_NFC) failed: %s\n",
                 u_errorName(errorCode));
     }
 }
Beispiel #15
0
NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
    fNormalizedText = NULL;
    fNormalizedTextLength = 0;
    fOriginalText = text;
    if (U_FAILURE(status)) {
        return;
    }
    fNormalizedText = fSmallBuf;
    fNormalizedTextLength = unorm_normalize(
        text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        status = U_ZERO_ERROR;
        fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
        if (fNormalizedText == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
                                        fNormalizedText, fNormalizedTextLength+1, &status);
        }
    }
}
Beispiel #16
0
static const SimpleFontData* fontDataForCombiningCharacterSequence(const Font* font, const UChar* characters, size_t length)
{
    UErrorCode error = U_ZERO_ERROR;
    Vector<UChar, 4> normalizedCharacters(length);
    int32_t normalizedLength = unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], length, &error);
    // Should fallback if we have an error or no composition occurred.
    if (U_FAILURE(error) || (static_cast<size_t>(normalizedLength) == length))
        return 0;
    UChar32 normalizedCharacter;
    size_t index = 0;
    U16_NEXT(&normalizedCharacters[0], index, static_cast<size_t>(normalizedLength), normalizedCharacter);
    return font->glyphDataForCharacter(normalizedCharacter, false).fontData;
}
CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
    // unaffected by NFC. This is effectively the same as saying that all
    // Latin-1 text is already normalized to NFC.
    // Source: http://unicode.org/reports/tr15/
    if (string.is8Bit())
        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);

    const UChar* source = string.characters16();
    size_t length = string.length();

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(length);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        length = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, length, handling);
}
CString TextEncoding::encode(StringView text, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (text.isEmpty())
        return "";

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    auto upconvertedCharacters = text.upconvertedCharacters();

    const UChar* source = upconvertedCharacters;
    size_t sourceLength = text.length();

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        sourceLength = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, sourceLength, handling);
}
int helper_normalize_str(const char *src, char *dest, int dest_size)
{
	int type = CTS_LANG_OTHERS;
	int32_t size;
	UErrorCode status = 0;
	UChar tmp_result[CTS_SQL_MAX_LEN*2];
	UChar result[CTS_SQL_MAX_LEN*2];
	int i = 0;
	int j = 0;
	int str_len = strlen(src);
	int char_len = 0;

	for (i=0;i<str_len;i+=char_len) {
		char char_src[10];
		char_len = check_utf8(src[i]);
		memcpy(char_src, &src[i], char_len);
		char_src[char_len] = '\0';

		u_strFromUTF8(tmp_result, array_sizeof(tmp_result), NULL, char_src, -1, &status);
		h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED,
				"u_strFromUTF8() Failed(%s)", u_errorName(status));

		u_strToLower(tmp_result, array_sizeof(tmp_result), tmp_result, -1, NULL, &status);
		h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED,
				"u_strToLower() Failed(%s)", u_errorName(status));

		size = unorm_normalize(tmp_result, -1, UNORM_NFD, 0,
				(UChar *)result, array_sizeof(result), &status);
		h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED,
				"unorm_normalize(%s) Failed(%s)", char_src, u_errorName(status));

		if (0 == i)
			type = helper_check_language(result);
		helper_extra_normalize(result, size);

		u_strToUTF8(&dest[j], dest_size-j, &size, result, -1, &status);
		h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED,
				"u_strToUTF8() Failed(%s)", u_errorName(status));
		j += size;
		dest[j++] = 0x01;
	}
	dest[j]='\0';
	HELPER_DBG("src(%s) is transformed(%s)", src, dest);
	return type;
}
Beispiel #20
0
wchar_t
getBaseCharacter (wchar_t character) {
#ifdef HAVE_ICU
  UChar source[] = {character};
  const unsigned int resultLength = 0X10;
  UChar resultBuffer[resultLength];
  UErrorCode error = U_ZERO_ERROR;

  unorm_normalize(source, ARRAY_COUNT(source),
                  UNORM_NFD, 0,
                  resultBuffer, resultLength,
                  &error);

  if (U_SUCCESS(error)) return resultBuffer[0];
#endif /* HAVE_ICU */

  return 0;
}
Beispiel #21
0
UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks()
{
    // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values
    static const uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8;

    if (m_currentCharacter + 1 >= m_endCharacter)
        return 0;

    if (u_getCombiningClass(m_characters[1]) == hiraganaKatakanaVoicingMarksCombiningClass) {
        // Normalize into composed form using 3.2 rules.
        UChar normalizedCharacters[2] = { 0, 0 };
        UErrorCode uStatus = U_ZERO_ERROR;  
        int32_t resultLength = unorm_normalize(m_characters, 2, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], 2, &uStatus);
        if (resultLength == 1 && !uStatus)
            return normalizedCharacters[0];
    }

    return 0;
}
Beispiel #22
0
UChar32 WidthIterator::normalizeVoicingMarks(int currentCharacter)
{
    if (currentCharacter + 1 < m_end) {
        if (combiningClass(m_run[currentCharacter + 1]) == hiraganaKatakanaVoicingMarksCombiningClass) {
#if USE(ICU_UNICODE)
            // Normalize into composed form using 3.2 rules.
            UChar normalizedCharacters[2] = { 0, 0 };
            UErrorCode uStatus = U_ZERO_ERROR;  
            int32_t resultLength = unorm_normalize(m_run.data(currentCharacter), 2,
                UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], 2, &uStatus);
            if (resultLength == 1 && uStatus == 0)
                return normalizedCharacters[0];
#elif USE(QT4_UNICODE)
            QString tmp(reinterpret_cast<const QChar*>(m_run.data(currentCharacter)), 2);
            QString res = tmp.normalized(QString::NormalizationForm_C, QChar::Unicode_3_2);
            if (res.length() == 1)
                return res.at(0).unicode();
#endif
        }
    }
    return 0;
}
Beispiel #23
0
// normalize {{{
static PyObject *
icu_normalize(PyObject *self, PyObject *args) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t sz = 0, mode = UNORM_DEFAULT, cap = 0, rsz = 0;
    UChar *dest = NULL, *source = NULL;
    PyObject *ret = NULL, *src = NULL;
  
    if (!PyArg_ParseTuple(args, "iO", &mode, &src)) return NULL;
    source = python_to_icu(src, &sz, 1);
    if (source == NULL) goto end; 
    cap = 2 * sz;
    dest = (UChar*) calloc(cap, sizeof(UChar));
    if (dest == NULL) { PyErr_NoMemory(); goto end; }

    while (1) {
        rsz = unorm_normalize(source, sz, (UNormalizationMode)mode, 0, dest, cap, &status);
        if (status == U_BUFFER_OVERFLOW_ERROR) {
            cap *= 2;
            dest = (UChar*) realloc(dest, cap*sizeof(UChar));
            if (dest == NULL) { PyErr_NoMemory(); goto end; }
            continue;
        }
        break;
    }

    if (U_FAILURE(status)) {
        PyErr_SetString(PyExc_ValueError, u_errorName(status));
        goto end;
    }
 
    ret = icu_to_python(dest, rsz);

end:
    if (source != NULL) free(source);
    if (dest != NULL) free(dest);
    return ret;
} // }}}
Beispiel #24
0
char UTF8NFKD::processText(SWBuf &text, const SWKey *key, const SWModule *module)
{
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;
        
	int32_t len =  5 + text.length() * 5;
        source = new UChar[len + 1]; //each char could become a surrogate pair

	// Convert UTF-8 string to UTF-16 (UChars)
        int32_t ulen = ucnv_toUChars(conv, source, len, text.c_str(), -1, &err);
        target = new UChar[len + 1];

        //compatability decomposition
        ulen = unorm_normalize(source, ulen, UNORM_NFKD, 0, target, len, &err);

	   text.setSize(len);
	   len = ucnv_fromUChars(conv, text.getRawData(), len, target, ulen, &err);
	   text.setSize(len);

	   delete [] source;
	   delete [] target;

	return 0;
}
Beispiel #25
0
static inline void 
do_norm( 
    ErlNifBinary  in,
    ErlNifBinary& out, 
    int32_t& ulen,
    UNormalizationMode mode,
    UErrorCode& status) 
{
    status = U_ZERO_ERROR;
    if (!enif_alloc_binary(FROM_ULEN(ulen), &out)) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    /* set a new ulen */
    ulen = unorm_normalize(
        (const UChar *) in.data,
        TO_ULEN(in.size),
        mode,
        0,
        (UChar *) out.data,
        ulen,
        &status);

    if (U_FAILURE(status)) {
        /* release the memory in one place */
        enif_release_binary(&out);
        return;
    }

    if (FROM_ULEN(ulen) != out.size) {
        /* shrink binary if it was too large */
        enif_realloc_binary(&out, FROM_ULEN(ulen));
    }

}
Beispiel #26
0
static void TestJB1401(void)
{
    UCollator     *myCollator = 0;
    UErrorCode     status = U_ZERO_ERROR;
    static UChar   NFD_UnsafeStartChars[] = {
        0x0f73,          /* Tibetan Vowel Sign II */
        0x0f75,          /* Tibetan Vowel Sign UU */
        0x0f81,          /* Tibetan Vowel Sign Reversed II */
            0
    };
    int            i;

    
    myCollator = ucol_open("en_US", &status);
    if (U_FAILURE(status)){
        int32_t     bufferLen   = 0;
        UChar       dispName    [100]; 
        bufferLen = uloc_getDisplayName("en_US", 0, dispName, 100, &status);
        /*Report the error with display name... */
        log_err("ERROR: Failed to create the collator for : \"%s\"\n", dispName);
        return;
    }
    ucol_setAttribute(myCollator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
    if (U_FAILURE(status)){
        log_err("ERROR: Failed to set normalization mode ON for collator.\n");
        return;
    }

    for (i=0; ; i++) {
        UChar    c;
        UChar    X[4];
        UChar    Y[20];
        UChar    Z[20];

        /*  Get the next funny character to be tested, and set up the
         *  three test strings X, Y, Z, consisting of an A-grave + test char,
         *    in original form, NFD, and then NFC form.
         */
        c = NFD_UnsafeStartChars[i];
        if (c==0) {break;}

        X[0]=0xC0; X[1]=c; X[2]=0;   /* \u00C0 is A Grave*/
        
        unorm_normalize(X, -1, UNORM_NFD, 0, Y, 20, &status);
        unorm_normalize(Y, -1, UNORM_NFC, 0, Z, 20, &status);
        if (U_FAILURE(status)){
            log_err("ERROR: Failed to normalize test of character %x\n", c);
            return;
        }

        /* Collation test.  All three strings should be equal.
         *   doTest does both strcoll and sort keys, with params in both orders.
         */
        doTest(myCollator, X, Y, UCOL_EQUAL);
        doTest(myCollator, X, Z, UCOL_EQUAL);
        doTest(myCollator, Y, Z, UCOL_EQUAL);

        /* Run collation element iterators over the three strings.  Results should be same for each.
         */
        {
            UCollationElements *ceiX, *ceiY, *ceiZ;
            int32_t             ceX,   ceY,   ceZ;
            int                 j;

            ceiX = ucol_openElements(myCollator, X, -1, &status);
            ceiY = ucol_openElements(myCollator, Y, -1, &status);
            ceiZ = ucol_openElements(myCollator, Z, -1, &status);
            if (U_FAILURE(status)) {
                log_err("ERROR: uucol_openElements failed.\n");
                return;
            }

            for (j=0;; j++) {
                ceX = ucol_next(ceiX, &status);
                ceY = ucol_next(ceiY, &status);
                ceZ = ucol_next(ceiZ, &status);
                if (U_FAILURE(status)) {
                    log_err("ERROR: ucol_next failed for iteration #%d.\n", j);
                    break;
                }
                if (ceX != ceY || ceY != ceZ) {
                    log_err("ERROR: ucol_next failed for iteration #%d.\n", j);
                    break;
                }
                if (ceX == UCOL_NULLORDER) {
                    break;
                }
            }
            ucol_closeElements(ceiX);
            ucol_closeElements(ceiY);
            ucol_closeElements(ceiZ);
        }
    }
    ucol_close(myCollator);
}
Beispiel #27
0
DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
{
    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();

    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
        return qcs.latin1();

    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding) 
        && qcs.isAllASCII())
        return qcs.ascii();

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    DeprecatedString copy = qcs;
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return DeprecatedCString();

    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
    // convert from that w/o having to allocate a unicode buffer

    char buffer[ConversionBufferSize];
    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
    const UChar* sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;
    DeprecatedString normalizedString;
    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed
        
        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedString.truncate(normalizedLength);
            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
        }
        
        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
        sourceLimit = source + normalizedLength;
    }

    DeprecatedCString result(1); // for trailing zero

    if (allowEntities)
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
    else {
        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
    }

    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return DeprecatedCString();

    do {
        char* target = buffer;
        char* targetLimit = target + ConversionBufferSize;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true,  &err);
        int count = target - buffer;
        buffer[count] = 0;
        result.append(buffer);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    return result;
}
Beispiel #28
0
Variant c_Normalizer::ti_normalize(const char* cls , CStrRef input,
                                   int64 form /* = q_Normalizer___FORM_C */) {
  STATIC_METHOD_INJECTION_BUILTIN(Normalizer, Normalizer::normalize);
  s_intl_error->m_error.clear();

  int expansion_factor = 1;
  switch(form) {
  case UNORM_NONE:
  case UNORM_NFC:
  case UNORM_NFKC:
    break;
  case UNORM_NFD:
  case UNORM_NFKD:
    expansion_factor = 3;
    break;
  default:
    s_intl_error->m_error.code = U_ILLEGAL_ARGUMENT_ERROR;
    s_intl_error->m_error.custom_error_message =
      "normalizer_normalize: illegal normalization form";
    return null;
  }

  /* First convert the string to UTF-16. */
  UChar* uinput = NULL; int uinput_len = 0;
  UErrorCode status = U_ZERO_ERROR;
  intl_convert_utf8_to_utf16(&uinput, &uinput_len, input.data(), input.size(),
                             &status);

  if (U_FAILURE(status)) {
    s_intl_error->m_error.code = status;
    s_intl_error->m_error.custom_error_message =
        "Error converting string to UTF-16.";
    free(uinput);
    return null;
  }

  /* Allocate memory for the destination buffer for normalization */
  int uret_len = uinput_len * expansion_factor;
  UChar *uret_buf = (UChar*)malloc((uret_len + 1) * sizeof(UChar));

  /* normalize */
  int size_needed = unorm_normalize(uinput, uinput_len,
                                    (UNormalizationMode)form, (int32_t) 0,
                                    uret_buf, uret_len, &status);

  /* Bail out if an unexpected error occured.
   * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
   * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string
   * is empty).
   */
  if (U_FAILURE(status) &&
      status != U_BUFFER_OVERFLOW_ERROR &&
      status != U_STRING_NOT_TERMINATED_WARNING) {
    free(uret_buf);
    free(uinput);
    return null;
  }

  if (size_needed > uret_len) {
    /* realloc does not seem to work properly - memory is corrupted
     * uret_buf =  eurealloc(uret_buf, size_needed + 1); */
    free(uret_buf);
    uret_buf = (UChar*)malloc((size_needed + 1) * sizeof(UChar));
    uret_len = size_needed;

    status = U_ZERO_ERROR;

    /* try normalize again */
    size_needed = unorm_normalize( uinput, uinput_len,
                                   (UNormalizationMode)form, (int32_t) 0,
                                   uret_buf, uret_len, &status);

    /* Bail out if an unexpected error occured. */
    if (U_FAILURE(status)) {
      /* Set error messages. */
      s_intl_error->m_error.code = status;
      s_intl_error->m_error.custom_error_message = "Error normalizing string";
      free(uret_buf);
      free(uinput);
      return null;
    }
  }

  free(uinput);

  /* the buffer we actually used */
  uret_len = size_needed;

  /* Convert normalized string from UTF-16 to UTF-8. */
  char* ret_buf = NULL; int ret_len = 0;
  intl_convert_utf16_to_utf8(&ret_buf, &ret_len, uret_buf, uret_len, &status);
  free(uret_buf);
  if (U_FAILURE(status)) {
    s_intl_error->m_error.code = status;
    s_intl_error->m_error.custom_error_message =
      "normalizer_normalize: error converting normalized text UTF-8";
    return null;
  }

  return String(ret_buf, ret_len, AttachString);
}
Beispiel #29
0
/**
 * @param mode same with ICU mode flag UNormalizationMode.
 */
size_t uni_normalize(char* src, size_t src_len, char* dst, size_t dst_capacity, int mode, int opt){
    UNormalizationMode umode = (UNormalizationMode)mode;
    // status holder
    UErrorCode ustatus = U_ZERO_ERROR;
    // UChar source
    UChar *s;
    int32_t s_length, s_capacity;
    // UChar normalized
    UChar *d;
    int32_t d_length, d_capacity;
    // UTF8 normalzied
    int32_t dst_alloc;
	
    // convert UTF-8 -> UChar
	u_strFromUTF8(NULL, 0, &s_length, src, (int32_t)src_len, &ustatus);
	if(U_FAILURE(ustatus) && ustatus!=U_BUFFER_OVERFLOW_ERROR){
		char buf[1024];
		sprintf(buf,"ICU u_strFromUTF8(pre-flighting) error with %d\n", ustatus);
		fputs(buf, stderr); fflush(stderr);
		return 0;
	}else{
	    ustatus = U_ZERO_ERROR;
	}
	s_capacity = (s_length+7)/8*8; // for '\0' termination
    s = (UChar*)my_malloc(s_capacity*sizeof(UChar), MYF(MY_WME));
    if(!s){
		fputs("malloc failure\n", stderr); fflush(stderr);
		return 0;
	}
    s = u_strFromUTF8(s, s_length, NULL, src, (int32_t)src_len, &ustatus);
    if(U_FAILURE(ustatus)){
		char buf[1024];
		sprintf(buf,"ICU u_strFromUTF8 error with %d\n", ustatus);
		fputs(buf, stderr); fflush(stderr);
		my_free(s);
		return 0;
	}else{
	    ustatus = U_ZERO_ERROR;
	}
	
    // normalize
	d_length = unorm_normalize(s, s_length, umode, (int32_t)opt, NULL, 0, &ustatus);
	if(U_FAILURE(ustatus) && ustatus!=U_BUFFER_OVERFLOW_ERROR){
		char buf[1024];
		sprintf(buf,"ICU unorm_normalize(pre-flighting) error with %d\n", ustatus);
		fputs(buf, stderr); fflush(stderr);
		my_free(s);
		return 0;
	}else{
	    ustatus = U_ZERO_ERROR;
	}
	d_capacity = (d_length+7)/8*8;
    d = (UChar*)my_malloc(d_capacity*sizeof(UChar), MYF(MY_WME));
    if(!d){
		fputs("malloc failure\n", stderr); fflush(stderr);
		my_free(s);
		return 0;
    }
	d_length = unorm_normalize(s, s_length, umode, (int32_t)opt, d, d_capacity, &ustatus);
	if(U_FAILURE(ustatus)){
		char buf[1024];
		sprintf(buf,"ICU unorm_normalize error with %d\n", ustatus);
		fputs(buf, stderr); fflush(stderr);
		my_free(s);
		my_free(d);
		return 0;
	}else{
	    ustatus = U_ZERO_ERROR;
	}
	my_free(s);
	
    // encode UChar -> UTF-8
    u_strToUTF8(dst, (int32_t)dst_capacity, &dst_alloc, d, d_length, &ustatus);
    my_free(d);
	return (size_t)dst_alloc;
}
Beispiel #30
0
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
                   uint32_t type,
                   const UChar *s,  int32_t length,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {

    // TODO:  this function could be sped up a bit
    //        Skip the input normalization when not needed, work from callers data.
    //        Put the initial skeleton straight into the caller's destination buffer.
    //        It probably won't need normalization.
    //        But these would make the structure more complicated.  

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
        (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

   int32_t tableMask = 0;
   switch (type) {
      case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
      case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
      default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // NFD transform of the user supplied input
    
    UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar *nfdInput = nfdStackBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
        if (nfdInput == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
                                        nfdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
        if (nfdInput != nfdStackBuf) {
            uprv_free(nfdInput);
        }
        return 0;
    }

    // buffer to hold the Unicode defined skeleton mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

    // Apply the skeleton mapping to the NFD normalized input string
    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
    UnicodeString skelStr;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
        skelStr.append(buf, replaceLen);
    }

    if (nfdInput != nfdStackBuf) {
        uprv_free(nfdInput);
    }
    
    const UChar *result = skelStr.getBuffer();
    int32_t  resultLen  = skelStr.length();
    UChar   *normedResult = NULL;

    // Check the skeleton for NFD, normalize it if needed.
    // Unnormalized results should be very rare.
    if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
        normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
        normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
        if (normedResult == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
        result = normedResult;
        resultLen = normalizedLen;
    }

    // Copy the skeleton to the caller's buffer
    if (U_SUCCESS(*status)) {
        if (destCapacity == 0 || resultLen > destCapacity) {
            *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
        } else {
            u_memcpy(dest, result, resultLen);
            if (destCapacity > resultLen) {
                dest[resultLen] = 0;
            } else {
                *status = U_STRING_NOT_TERMINATED_WARNING;
            }
        }
     }       
     uprv_free(normedResult);
     return resultLen;
}