/*
 * Lua binding: normalize the ustring at stack index 1.
 *
 * Stack arguments:
 *   1: ustring to normalize
 *   2: optional mode name, looked up in modeNames (default DEFAULT_MODE_OPTION)
 *   3: optional numeric ICU normalization options (default 0)
 *
 * Returns one value (the normalized ustring) on success, or two values
 * (nil, ICU error name) on failure.
 */
static int icu_normalizer_normalize(lua_State *L) {
    const UChar* source = icu4lua_checkustring(L, 1, NORMALIZER_UV_USTRING_META);
    int32_t sourceLength = (int32_t)icu4lua_ustrlen(L, 1);
    UNormalizationMode mode = modes[luaL_checkoption(L, 2, DEFAULT_MODE_OPTION, modeNames)];
    int32_t options = (int32_t)luaL_optnumber(L, 3, 0);
    UErrorCode status = U_ZERO_ERROR;
    UChar* result;
    int32_t resultLength;

    /* Preflight with a NULL buffer: a buffer overflow here only reports the required length. */
    resultLength = unorm_normalize(source, sourceLength, mode, options, NULL, 0, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
        lua_pushnil(L);
        lua_pushstring(L, u_errorName(status));
        return 2;
    }

    /* malloc(0) may legitimately return NULL, so always request at least one unit
     * and treat NULL as a genuine allocation failure. */
    result = (UChar*)malloc(sizeof(UChar) * (resultLength > 0 ? (size_t)resultLength : 1));
    if (result == NULL) {
        lua_pushnil(L);
        lua_pushstring(L, u_errorName(U_MEMORY_ALLOCATION_ERROR));
        return 2;
    }

    status = U_ZERO_ERROR;
    unorm_normalize(source, sourceLength, mode, options, result, resultLength, &status);
    if (U_FAILURE(status)) {
        free(result);
        lua_pushnil(L);
        lua_pushstring(L, u_errorName(status));
        return 2;
    }

    icu4lua_pushustring(L, result, resultLength, NORMALIZER_UV_USTRING_META, NORMALIZER_UV_USTRING_POOL);
    free(result);
    return 1;
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const { if (!m_name) return CString(); if (!length) return ""; #if USE(ICU_UNICODE) // FIXME: What's the right place to do normalization? // It's a little strange to do it inside the encode function. // Perhaps normalization should be an explicit step done before calling encode. const UChar* source = characters; size_t sourceLength = length; Vector<UChar> normalizedCharacters; UErrorCode err = U_ZERO_ERROR; if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { // First try using the length of the original string, since normalization to NFC rarely increases length. normalizedCharacters.grow(sourceLength); int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); if (err == U_BUFFER_OVERFLOW_ERROR) { err = U_ZERO_ERROR; normalizedCharacters.resize(normalizedLength); normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); } ASSERT(U_SUCCESS(err)); source = normalizedCharacters.data(); sourceLength = normalizedLength; } return newTextCodec(*this)->encode(source, sourceLength, handling); #elif USE(QT4_UNICODE) QString str(reinterpret_cast<const QChar*>(characters), length); str = str.normalized(QString::NormalizationForm_C); return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); #elif USE(GLIB_UNICODE) GOwnPtr<char> UTF8Source; UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0)); if (!UTF8Source) { // If conversion to UTF-8 failed, try with the string without normalization return newTextCodec(*this)->encode(characters, length, handling); } GOwnPtr<char> UTF8Normalized; UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC)); long UTF16Length; GOwnPtr<UChar> UTF16Normalized; UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 
0)); return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling); #elif OS(WINCE) // normalization will be done by Windows CE API OwnPtr<TextCodec> textCodec = newTextCodec(*this); return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); #endif }
/*
 * Normalizes the UTF-16 string src/src_len with mode nm.
 *
 * On return *status holds the ICU status:
 *  - an out-of-range nm yields U_ILLEGAL_ARGUMENT_ERROR and touches nothing else;
 *  - UNORM_NONE makes *target point at src itself (no copy is made);
 *  - if the preflight does not report a buffer overflow, only *target_len is
 *    updated and *target is left as the caller passed it;
 *  - otherwise *target receives a newly allocated, NUL-terminated buffer of
 *    *target_len units, or NULL (with *target_len 0) when normalization fails.
 */
void utf16_normalize(UChar **target, int32_t *target_len, const UChar *src, int32_t src_len, UNormalizationMode nm, UErrorCode *status)
{
    int32_t needed;

    *status = U_ZERO_ERROR;

    if (nm < UNORM_NONE || nm >= UNORM_MODE_COUNT) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (UNORM_NONE == nm) {
        *target = (UChar *) src;
        *target_len = src_len;
        return;
    }

    /* Preflight: ask ICU how many units the normalized form needs. */
    needed = unorm_normalize(src, src_len, nm, 0, NULL, 0, status);
    *target_len = needed;
    if (U_BUFFER_OVERFLOW_ERROR != *status) {
        return;
    }

    *status = U_ZERO_ERROR;
    *target = mem_new_n(**target, needed + 1);
    /* The length is already known from the preflight, so the return value is ignored. */
    unorm_normalize(src, src_len, nm, 0, *target, needed + 1, status);
    if (U_FAILURE(*status)) {
        efree(*target);
        *target = NULL;
        *target_len = 0;
        return;
    }

    (*target)[needed] = '\0';
    assert(U_ZERO_ERROR == *status);
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const { if (!m_name) return CString(); if (!length) return ""; #if USE(ICU_UNICODE) // FIXME: What's the right place to do normalization? // It's a little strange to do it inside the encode function. // Perhaps normalization should be an explicit step done before calling encode. const UChar* source = characters; size_t sourceLength = length; Vector<UChar> normalizedCharacters; UErrorCode err = U_ZERO_ERROR; if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { // First try using the length of the original string, since normalization to NFC rarely increases length. normalizedCharacters.grow(sourceLength); int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); if (err == U_BUFFER_OVERFLOW_ERROR) { err = U_ZERO_ERROR; normalizedCharacters.resize(normalizedLength); normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); } ASSERT(U_SUCCESS(err)); source = normalizedCharacters.data(); sourceLength = normalizedLength; } return newTextCodec(*this)->encode(source, sourceLength, handling); #elif USE(QT4_UNICODE) QString str(reinterpret_cast<const QChar*>(characters), length); str = str.normalized(QString::NormalizationForm_C); return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); #elif USE(CASQT_UNICODE) // FIXME:CASQT #if 0 // 暂不执行normalized,该函数效率太低 QString str(reinterpret_cast<const ushort*>(characters), length); str = str.normalized(QString::NormalizationForm_C); return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); #else return newTextCodec(*this)->encode(characters, length, handling); #endif #endif }
bool SimpleFontData::canRenderCombiningCharacterSequence( const UChar* characters, size_t length) const { if (!m_combiningCharacterSequenceSupport) m_combiningCharacterSequenceSupport = adoptPtr(new HashMap<String, bool>); WTF::HashMap<String, bool>::AddResult addResult = m_combiningCharacterSequenceSupport->add(String(characters, length), false); if (!addResult.isNewEntry) return addResult.storedValue->value; UErrorCode error = U_ZERO_ERROR; Vector<UChar, 4> normalizedCharacters(length); int32_t normalizedLength = unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], length, &error); // Can't render if we have an error or no composition occurred. if (U_FAILURE(error) || (static_cast<size_t>(normalizedLength) == length)) return false; SkPaint paint; m_platformData.setupPaint(&paint); paint.setTextEncoding(SkPaint::kUTF16_TextEncoding); if (paint.textToGlyphs(&normalizedCharacters[0], normalizedLength * 2, 0)) { addResult.storedValue->value = true; return true; } return false; }
UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks() { // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values static const uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8; if (m_currentCharacter + 1 >= m_endCharacter) return 0; if (combiningClass(m_characters[1]) == hiraganaKatakanaVoicingMarksCombiningClass) { #if USE(ICU_UNICODE) // Normalize into composed form using 3.2 rules. UChar normalizedCharacters[2] = { 0, 0 }; UErrorCode uStatus = U_ZERO_ERROR; int32_t resultLength = unorm_normalize(m_characters, 2, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], 2, &uStatus); if (resultLength == 1 && !uStatus) return normalizedCharacters[0]; #elif USE(QT4_UNICODE) QString tmp(reinterpret_cast<const QChar*>(m_characters), 2); QString res = tmp.normalized(QString::NormalizationForm_C, QChar::Unicode_3_2); if (res.length() == 1) return res.at(0).unicode(); #endif } return 0; }
bool SimpleFontData::canRenderCombiningCharacterSequence(const UChar* characters, size_t length) const { if (!m_combiningCharacterSequenceSupport) m_combiningCharacterSequenceSupport = adoptPtr(new HashMap<String, bool>); WTF::HashMap<String, bool>::AddResult addResult = m_combiningCharacterSequenceSupport->add(String(characters, length), false); if (!addResult.isNewEntry) return addResult.iterator->value; UErrorCode error = U_ZERO_ERROR; Vector<UChar, 4> normalizedCharacters(length); int32_t normalizedLength = unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], length, &error); // Can't render if we have an error or no composition occurred. if (U_FAILURE(error) || (static_cast<size_t>(normalizedLength) == length)) return false; FT_Face face = cairo_ft_scaled_font_lock_face(m_platformData.scaledFont()); if (!face) return false; if (FcFreeTypeCharIndex(face, normalizedCharacters[0])) addResult.iterator->value = true; cairo_ft_scaled_font_unlock_face(m_platformData.scaledFont()); return addResult.iterator->value; }
bool SimpleFontData::canRenderCombiningCharacterSequence(const UChar* characters, size_t length) const { if (!m_combiningCharacterSequenceSupport) m_combiningCharacterSequenceSupport = adoptPtr(new HashMap<String, bool>); WTF::HashMap<String, bool>::AddResult addResult = m_combiningCharacterSequenceSupport->add(String(characters, length), false); if (!addResult.isNewEntry) return addResult.iterator->value; UErrorCode error = U_ZERO_ERROR; Vector<UChar, 4> normalizedCharacters(length); int32_t normalizedLength = unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], length, &error); if (U_FAILURE(error)) return false; int position = 0; while (position < normalizedLength) { UChar32 character; int nextPosition = position; U16_NEXT(normalizedCharacters, nextPosition, normalizedLength, character); if (!u_hasBinaryProperty(character, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) { FS_USHORT glyph = FS_map_char(m_platformData.font(), static_cast<FS_ULONG>(character)); if (!glyph) return false; } position = nextPosition; } addResult.iterator->value = true; return true; }
/*
 * Forwards every argument unchanged to ICU's unorm_normalize and returns its
 * result.
 */
int32_t __hs_unorm_normalize(const UChar *source, int32_t sourceLength,
                             UNormalizationMode mode, int32_t options,
                             UChar *result, int32_t resultLength,
                             UErrorCode *status)
{
    const int32_t normalizedLength =
        unorm_normalize(source, sourceLength, mode, options, result, resultLength, status);

    return normalizedLength;
}
ToNFC(const UTrie2PerfTest &testcase) : Command(testcase) { UErrorCode errorCode=U_ZERO_ERROR; destCapacity=unorm_normalize(testcase.getBuffer(), testcase.getBufferLen(), UNORM_NFC, 0, NULL, 0, &errorCode); dest=new UChar[destCapacity]; }
// Normalizes src/srcLen with |mode| and |options| into a newly allocated,
// NUL-terminated buffer stored in dest->name, and stores its length in
// dest->len.
//
// If the preflight fails, the error is printed and dest is set to an empty
// state (name NULL, len 0). The original looped forever in that case, because
// ICU returns immediately while the status is still a failure. It also looped
// forever when the preflight succeeded without an overflow (an empty result).
void NormalizerPerformanceTest::normalizeInput(ULine* dest, const UChar* src, int32_t srcLen, UNormalizationMode mode, int32_t options) {
    UErrorCode status = U_ZERO_ERROR;

    /* pure pre-flight */
    int32_t reqLen = unorm_normalize(src, srcLen, mode, options, NULL, 0, &status);

    // A buffer overflow is the expected preflight outcome; anything else that
    // counts as a failure is a real error.
    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
        printf("Could not normalize input. Error: %s", u_errorName(status));
        dest->name = NULL;
        dest->len = 0;
        return;
    }

    // Reached for both an overflow and a zero-length result: always allocate
    // room for the terminating NUL and run the real normalization once.
    status = U_ZERO_ERROR;
    dest->name = new UChar[reqLen + 1];
    reqLen = unorm_normalize(src, srcLen, mode, options, dest->name, reqLen + 1, &status);
    dest->len = reqLen;
}
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const { if (!m_name) return CString(); if (!length) return ""; #if USE(ICU_UNICODE) // FIXME: What's the right place to do normalization? // It's a little strange to do it inside the encode function. // Perhaps normalization should be an explicit step done before calling encode. const UChar* source = characters; size_t sourceLength = length; Vector<UChar> normalizedCharacters; UErrorCode err = U_ZERO_ERROR; if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { // First try using the length of the original string, since normalization to NFC rarely increases length. normalizedCharacters.grow(sourceLength); int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); if (err == U_BUFFER_OVERFLOW_ERROR) { err = U_ZERO_ERROR; normalizedCharacters.resize(normalizedLength); normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); } ASSERT(U_SUCCESS(err)); source = normalizedCharacters.data(); sourceLength = normalizedLength; } return newTextCodec(*this)->encode(source, sourceLength, handling); #elif USE(JAVA_UNICODE) String normalized = TextNormalizer::normalize( characters, length, TextNormalizer::NFC); return newTextCodec(*this)->encode( normalized.characters(), normalized.length(), handling); #elif OS(WINDOWS) && USE(WCHAR_UNICODE) // normalization will be done by Windows CE API OwnPtr<TextCodec> textCodec = newTextCodec(*this); return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); #endif }
void normalizeCharactersIntoNFCForm(const UChar* characters, unsigned length, Vector<UChar>& buffer) { ASSERT(length); buffer.resize(length); UErrorCode status = U_ZERO_ERROR; size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status); ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR); ASSERT(bufferSize); buffer.resize(bufferSize); if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) return; status = U_ZERO_ERROR; unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status); ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); }
// Normalizes the test case's buffer to NFC into |dest| and reports on stderr
// if ICU fails or the output length differs from |destCapacity|.
// pErrorCode is not written by this method.
virtual void call(UErrorCode* pErrorCode) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const int32_t destLength = unorm_normalize(testcase.getBuffer(), testcase.getBufferLen(),
                                               UNORM_NFC, 0,
                                               dest, destCapacity,
                                               &errorCode);

    const bool normalizedAsExpected = U_SUCCESS(errorCode) && destLength == destCapacity;
    if (!normalizedAsExpected) {
        fprintf(stderr, "error: unorm_normalize(UNORM_NFC) failed: %s\n",
                u_errorName(errorCode));
    }
}
// Builds the NFD form of text/length.
// The result goes into fSmallBuf when it fits in USPOOF_STACK_BUFFER_SIZE
// units, otherwise into a buffer obtained from uprv_malloc. If |status| is
// already a failure on entry, nothing is normalized and fNormalizedText stays
// NULL. An allocation failure sets |status| to U_MEMORY_ALLOCATION_ERROR.
NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
    fNormalizedText = NULL;
    fNormalizedTextLength = 0;
    fOriginalText = text;

    if (U_FAILURE(status)) {
        return;
    }

    // First attempt: normalize into the fixed-size member buffer.
    fNormalizedText = fSmallBuf;
    fNormalizedTextLength = unorm_normalize(
        text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return;
    }

    // The member buffer was too small; the call above reported the required
    // length, so allocate that plus one unit for the terminator.
    status = U_ZERO_ERROR;
    const int32_t heapCapacity = fNormalizedTextLength + 1;
    fNormalizedText = (UChar *)uprv_malloc(heapCapacity * sizeof(UChar));
    if (fNormalizedText == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    fNormalizedTextLength = unorm_normalize(
        text, length, UNORM_NFD, 0, fNormalizedText, heapCapacity, &status);
}
// Composes |characters| with Unicode 3.2 NFC rules and returns the font data
// that |font| reports for the first code point of the composed text.
// Returns 0 when normalization fails or the length did not change.
static const SimpleFontData* fontDataForCombiningCharacterSequence(const Font* font, const UChar* characters, size_t length)
{
    UErrorCode status = U_ZERO_ERROR;
    Vector<UChar, 4> composed(length);
    int32_t composedLength = unorm_normalize(characters, length, UNORM_NFC, UNORM_UNICODE_3_2, &composed[0], length, &status);

    // Should fallback if we have an error or no composition occurred.
    if (U_FAILURE(status))
        return 0;
    if (static_cast<size_t>(composedLength) == length)
        return 0;

    // Decode the first code point of the composed text.
    UChar32 firstCodePoint;
    size_t offset = 0;
    U16_NEXT(&composed[0], offset, static_cast<size_t>(composedLength), firstCodePoint);

    return font->glyphDataForCharacter(firstCodePoint, false).fontData;
}
// Normalizes |string| to NFC and encodes it with the codec for this encoding.
// Returns a null CString when the encoding has no name and an empty CString
// for an empty string.
CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
    // unaffected by NFC. This is effectively the same as saying that all
    // Latin-1 text is already normalized to NFC.
    // Source: http://unicode.org/reports/tr15/
    if (string.is8Bit())
        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);

    const UChar* utf16 = string.characters16();
    const size_t utf16Length = string.length();

    UErrorCode err = U_ZERO_ERROR;

    // Input that quick-checks as NFC goes to the codec untouched.
    if (unorm_quickCheck(utf16, utf16Length, UNORM_NFC, &err) == UNORM_YES)
        return newTextCodec(*this)->encode(utf16, utf16Length, handling);

    // First try using the length of the original string, since normalization to NFC rarely increases length.
    Vector<UChar> nfcBuffer;
    nfcBuffer.grow(utf16Length);
    int32_t nfcLength = unorm_normalize(utf16, utf16Length, UNORM_NFC, 0, nfcBuffer.data(), utf16Length, &err);
    if (err == U_BUFFER_OVERFLOW_ERROR) {
        // The first pass reported the required length; retry with a buffer of that size.
        err = U_ZERO_ERROR;
        nfcBuffer.resize(nfcLength);
        nfcLength = unorm_normalize(utf16, utf16Length, UNORM_NFC, 0, nfcBuffer.data(), nfcLength, &err);
    }
    ASSERT(U_SUCCESS(err));

    return newTextCodec(*this)->encode(nfcBuffer.data(), nfcLength, handling);
}
// Normalizes |text| to NFC and encodes it with the codec for this encoding.
// Returns a null CString when the encoding has no name and an empty CString
// for empty text.
CString TextEncoding::encode(StringView text, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (text.isEmpty())
        return "";

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    // |upconvertedCharacters| must stay alive while |utf16| is in use.
    auto upconvertedCharacters = text.upconvertedCharacters();
    const UChar* utf16 = upconvertedCharacters;
    const size_t utf16Length = text.length();

    UErrorCode err = U_ZERO_ERROR;

    // Input that quick-checks as NFC goes to the codec untouched.
    if (unorm_quickCheck(utf16, utf16Length, UNORM_NFC, &err) == UNORM_YES)
        return newTextCodec(*this)->encode(utf16, utf16Length, handling);

    // First try using the length of the original string, since normalization to NFC rarely increases length.
    Vector<UChar> nfcBuffer;
    nfcBuffer.grow(utf16Length);
    int32_t nfcLength = unorm_normalize(utf16, utf16Length, UNORM_NFC, 0, nfcBuffer.data(), utf16Length, &err);
    if (err == U_BUFFER_OVERFLOW_ERROR) {
        // The first pass reported the required length; retry with a buffer of that size.
        err = U_ZERO_ERROR;
        nfcBuffer.resize(nfcLength);
        nfcLength = unorm_normalize(utf16, utf16Length, UNORM_NFC, 0, nfcBuffer.data(), nfcLength, &err);
    }
    ASSERT(U_SUCCESS(err));

    return newTextCodec(*this)->encode(nfcBuffer.data(), nfcLength, handling);
}
int helper_normalize_str(const char *src, char *dest, int dest_size) { int type = CTS_LANG_OTHERS; int32_t size; UErrorCode status = 0; UChar tmp_result[CTS_SQL_MAX_LEN*2]; UChar result[CTS_SQL_MAX_LEN*2]; int i = 0; int j = 0; int str_len = strlen(src); int char_len = 0; for (i=0;i<str_len;i+=char_len) { char char_src[10]; char_len = check_utf8(src[i]); memcpy(char_src, &src[i], char_len); char_src[char_len] = '\0'; u_strFromUTF8(tmp_result, array_sizeof(tmp_result), NULL, char_src, -1, &status); h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED, "u_strFromUTF8() Failed(%s)", u_errorName(status)); u_strToLower(tmp_result, array_sizeof(tmp_result), tmp_result, -1, NULL, &status); h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED, "u_strToLower() Failed(%s)", u_errorName(status)); size = unorm_normalize(tmp_result, -1, UNORM_NFD, 0, (UChar *)result, array_sizeof(result), &status); h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED, "unorm_normalize(%s) Failed(%s)", char_src, u_errorName(status)); if (0 == i) type = helper_check_language(result); helper_extra_normalize(result, size); u_strToUTF8(&dest[j], dest_size-j, &size, result, -1, &status); h_retvm_if(U_FAILURE(status), CTS_ERR_ICU_FAILED, "u_strToUTF8() Failed(%s)", u_errorName(status)); j += size; dest[j++] = 0x01; } dest[j]='\0'; HELPER_DBG("src(%s) is transformed(%s)", src, dest); return type; }
/*
 * Returns the first code point of the NFD (canonical decomposition) of
 * character, or 0 when ICU support is not compiled in, the value cannot be
 * encoded as UTF-16, or normalization fails.
 *
 * The character is encoded as UTF-16 before normalization and the result is
 * decoded again, so a code point above U+FFFF (possible when wchar_t is wider
 * than 16 bits) is no longer truncated to a single 16-bit unit.
 */
wchar_t
getBaseCharacter (wchar_t character) {
#ifdef HAVE_ICU
  UChar source[U16_MAX_LENGTH];
  int32_t sourceLength = 0;
  UBool encodingError = 0;

  UChar resultBuffer[0X10];
  int32_t resultLength;
  UErrorCode error = U_ZERO_ERROR;

  /* Writes one unit for BMP values and a surrogate pair above U+FFFF;
   * flags values beyond U+10FFFF as an error. */
  U16_APPEND(source, sourceLength, U16_MAX_LENGTH, (UChar32)character, encodingError);

  if (!encodingError) {
    resultLength = unorm_normalize(source, sourceLength,
                                   UNORM_NFD, 0,
                                   resultBuffer, ARRAY_COUNT(resultBuffer),
                                   &error);

    if (U_SUCCESS(error) && (resultLength > 0)) {
      UChar32 base;
      int32_t index = 0;

      /* Decode the leading code point, joining a surrogate pair if present. */
      U16_NEXT(resultBuffer, index, resultLength, base);
      return base;
    }
  }
#endif /* HAVE_ICU */

  return 0;
}
// If the unit after the current one has the kana voicing-mark combining class,
// composes the two-unit sequence with Unicode 3.2 NFC rules and returns the
// composed character when the result is a single unit. Returns 0 otherwise.
UChar32 SurrogatePairAwareTextIterator::normalizeVoicingMarks()
{
    // According to http://www.unicode.org/Public/UNIDATA/UCD.html#Canonical_Combining_Class_Values
    static const uint8_t hiraganaKatakanaVoicingMarksCombiningClass = 8;

    // Nothing follows the current character, so there is no mark to compose.
    if (m_currentCharacter + 1 >= m_endCharacter)
        return 0;

    if (u_getCombiningClass(m_characters[1]) != hiraganaKatakanaVoicingMarksCombiningClass)
        return 0;

    // Normalize into composed form using 3.2 rules.
    UChar composed[2] = { 0, 0 };
    UErrorCode uStatus = U_ZERO_ERROR;
    int32_t composedLength = unorm_normalize(m_characters, 2, UNORM_NFC, UNORM_UNICODE_3_2, &composed[0], 2, &uStatus);
    if (composedLength != 1 || uStatus != U_ZERO_ERROR)
        return 0;

    return composed[0];
}
UChar32 WidthIterator::normalizeVoicingMarks(int currentCharacter) { if (currentCharacter + 1 < m_end) { if (combiningClass(m_run[currentCharacter + 1]) == hiraganaKatakanaVoicingMarksCombiningClass) { #if USE(ICU_UNICODE) // Normalize into composed form using 3.2 rules. UChar normalizedCharacters[2] = { 0, 0 }; UErrorCode uStatus = U_ZERO_ERROR; int32_t resultLength = unorm_normalize(m_run.data(currentCharacter), 2, UNORM_NFC, UNORM_UNICODE_3_2, &normalizedCharacters[0], 2, &uStatus); if (resultLength == 1 && uStatus == 0) return normalizedCharacters[0]; #elif USE(QT4_UNICODE) QString tmp(reinterpret_cast<const QChar*>(m_run.data(currentCharacter)), 2); QString res = tmp.normalized(QString::NormalizationForm_C, QChar::Unicode_3_2); if (res.length() == 1) return res.at(0).unicode(); #endif } } return 0; }
// normalize {{{ static PyObject * icu_normalize(PyObject *self, PyObject *args) { UErrorCode status = U_ZERO_ERROR; int32_t sz = 0, mode = UNORM_DEFAULT, cap = 0, rsz = 0; UChar *dest = NULL, *source = NULL; PyObject *ret = NULL, *src = NULL; if (!PyArg_ParseTuple(args, "iO", &mode, &src)) return NULL; source = python_to_icu(src, &sz, 1); if (source == NULL) goto end; cap = 2 * sz; dest = (UChar*) calloc(cap, sizeof(UChar)); if (dest == NULL) { PyErr_NoMemory(); goto end; } while (1) { rsz = unorm_normalize(source, sz, (UNormalizationMode)mode, 0, dest, cap, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { cap *= 2; dest = (UChar*) realloc(dest, cap*sizeof(UChar)); if (dest == NULL) { PyErr_NoMemory(); goto end; } continue; } break; } if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } ret = icu_to_python(dest, rsz); end: if (source != NULL) free(source); if (dest != NULL) free(dest); return ret; } // }}}
// Replaces |text| with its NFKD (compatibility decomposition) form.
//
// Returns -1 without touching |text| when invoked with the en/deciphering key
// hack, or when conversion to UTF-16 or normalization fails. Returns -1 with
// |text| emptied when the final conversion fails. Returns 0 on success.
//
// The shared |err| status is cleared on entry: ICU functions return
// immediately when handed a failure status, so a stale error from an earlier
// call would otherwise make every later call fail.
char UTF8NFKD::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	err = U_ZERO_ERROR;

	int32_t len = 5 + text.length() * 5;
	source = new UChar[len + 1];	//each char could become a surrogate pair

	// Convert UTF-8 string to UTF-16 (UChars)
	int32_t ulen = ucnv_toUChars(conv, source, len, text.c_str(), -1, &err);
	if (U_FAILURE(err)) {
		delete [] source;
		return -1;
	}

	target = new UChar[len + 1];

	//compatability decomposition
	int32_t nlen = unorm_normalize(source, ulen, UNORM_NFKD, 0, target, len, &err);
	if (err == U_BUFFER_OVERFLOW_ERROR) {
		// The decomposition did not fit; nlen is the required length, so retry with that.
		delete [] target;
		target = new UChar[nlen + 1];
		err = U_ZERO_ERROR;
		nlen = unorm_normalize(source, ulen, UNORM_NFKD, 0, target, nlen + 1, &err);
	}
	delete [] source;
	if (U_FAILURE(err)) {
		delete [] target;
		return -1;
	}

	// From here on |text| is overwritten with the converted output.
	text.setSize(len);
	int32_t outLen = ucnv_fromUChars(conv, text.getRawData(), len, target, nlen, &err);
	if (err == U_BUFFER_OVERFLOW_ERROR) {
		// outLen is the required byte count; grow the output and convert again.
		err = U_ZERO_ERROR;
		text.setSize(outLen);
		outLen = ucnv_fromUChars(conv, text.getRawData(), outLen, target, nlen, &err);
	}
	delete [] target;

	if (U_FAILURE(err)) {
		text.setSize(0);
		return -1;
	}

	text.setSize(outLen);
	return 0;
}
// Normalizes the UTF-16 payload of |in| with |mode| into a newly allocated
// binary |out|.
//
// On entry |ulen| is the output capacity in UChars; on return it holds the
// length reported by ICU. On any failure |status| carries the error and |out|
// is not left allocated by this function.
static inline void do_norm(
    ErlNifBinary in,
    ErlNifBinary& out,
    int32_t& ulen,
    UNormalizationMode mode,
    UErrorCode& status)
{
    status = U_ZERO_ERROR;

    if (!enif_alloc_binary(FROM_ULEN(ulen), &out)) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    const UChar* input = (const UChar *) in.data;
    UChar* output = (UChar *) out.data;

    /* set a new ulen */
    ulen = unorm_normalize(input, TO_ULEN(in.size), mode, 0, output, ulen, &status);

    if (U_FAILURE(status)) {
        /* release the memory in one place */
        enif_release_binary(&out);
        return;
    }

    /* shrink binary if it was too large */
    if (FROM_ULEN(ulen) != out.size)
        enif_realloc_binary(&out, FROM_ULEN(ulen));
}
static void TestJB1401(void) { UCollator *myCollator = 0; UErrorCode status = U_ZERO_ERROR; static UChar NFD_UnsafeStartChars[] = { 0x0f73, /* Tibetan Vowel Sign II */ 0x0f75, /* Tibetan Vowel Sign UU */ 0x0f81, /* Tibetan Vowel Sign Reversed II */ 0 }; int i; myCollator = ucol_open("en_US", &status); if (U_FAILURE(status)){ int32_t bufferLen = 0; UChar dispName [100]; bufferLen = uloc_getDisplayName("en_US", 0, dispName, 100, &status); /*Report the error with display name... */ log_err("ERROR: Failed to create the collator for : \"%s\"\n", dispName); return; } ucol_setAttribute(myCollator, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); if (U_FAILURE(status)){ log_err("ERROR: Failed to set normalization mode ON for collator.\n"); return; } for (i=0; ; i++) { UChar c; UChar X[4]; UChar Y[20]; UChar Z[20]; /* Get the next funny character to be tested, and set up the * three test strings X, Y, Z, consisting of an A-grave + test char, * in original form, NFD, and then NFC form. */ c = NFD_UnsafeStartChars[i]; if (c==0) {break;} X[0]=0xC0; X[1]=c; X[2]=0; /* \u00C0 is A Grave*/ unorm_normalize(X, -1, UNORM_NFD, 0, Y, 20, &status); unorm_normalize(Y, -1, UNORM_NFC, 0, Z, 20, &status); if (U_FAILURE(status)){ log_err("ERROR: Failed to normalize test of character %x\n", c); return; } /* Collation test. All three strings should be equal. * doTest does both strcoll and sort keys, with params in both orders. */ doTest(myCollator, X, Y, UCOL_EQUAL); doTest(myCollator, X, Z, UCOL_EQUAL); doTest(myCollator, Y, Z, UCOL_EQUAL); /* Run collation element iterators over the three strings. Results should be same for each. 
*/ { UCollationElements *ceiX, *ceiY, *ceiZ; int32_t ceX, ceY, ceZ; int j; ceiX = ucol_openElements(myCollator, X, -1, &status); ceiY = ucol_openElements(myCollator, Y, -1, &status); ceiZ = ucol_openElements(myCollator, Z, -1, &status); if (U_FAILURE(status)) { log_err("ERROR: uucol_openElements failed.\n"); return; } for (j=0;; j++) { ceX = ucol_next(ceiX, &status); ceY = ucol_next(ceiY, &status); ceZ = ucol_next(ceiZ, &status); if (U_FAILURE(status)) { log_err("ERROR: ucol_next failed for iteration #%d.\n", j); break; } if (ceX != ceY || ceY != ceZ) { log_err("ERROR: ucol_next failed for iteration #%d.\n", j); break; } if (ceX == UCOL_NULLORDER) { break; } } ucol_closeElements(ceiX); ucol_closeElements(ceiY); ucol_closeElements(ceiZ); } } ucol_close(myCollator); }
// Encodes |qcs| into this decoder's encoding.
//
// All-Latin-1 / all-ASCII input takes a fast path for the encodings checked
// below. Otherwise backslashes are replaced with the encoding's currency
// symbol, the text is normalized to NFC when it is not already, and the ICU
// converter encodes it chunk by chunk. Characters the converter cannot encode
// become XML decimal escapes when |allowEntities| is true and "?" otherwise.
// Returns an empty DeprecatedCString when no converter can be created or the
// callbacks cannot be installed.
DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
{
    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();

    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
        return qcs.latin1();

    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding)
        && qcs.isAllASCII())
        return qcs.ascii();

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    DeprecatedString copy = qcs;
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return DeprecatedCString();

    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
    // convert from that w/o having to allocate a unicode buffer

    char buffer[ConversionBufferSize];
    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
    const UChar* sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;
    DeprecatedString normalizedString;
    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed

        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedString.truncate(normalizedLength);
            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
        }

        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
        sourceLimit = source + normalizedLength;
    }

    DeprecatedCString result(1); // for trailing zero

    if (allowEntities)
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
    else {
        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
    }

    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return DeprecatedCString();

    do {
        char* target = buffer;
        // Stop one byte short of the end of |buffer|: the terminating zero
        // written below needs a slot even when the converter fills the chunk.
        char* targetLimit = target + ConversionBufferSize - 1;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err);
        int count = target - buffer;
        buffer[count] = 0;
        result.append(buffer);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    return result;
}
// Normalizes the UTF-8 string |input| to the normalization form |form| and
// returns it as a UTF-8 String.
//
// Returns null and records the failure in s_intl_error when |form| is not a
// supported normalization form, when a UTF-8 <-> UTF-16 conversion fails, or
// when ICU fails to normalize the text.
Variant c_Normalizer::ti_normalize(const char* cls , CStrRef input, int64 form /* = q_Normalizer___FORM_C */) {
  STATIC_METHOD_INJECTION_BUILTIN(Normalizer, Normalizer::normalize);
  s_intl_error->m_error.clear();

  // Initial guess for how much larger the output buffer must be than the input.
  int expansion_factor = 1;
  switch(form) {
  case UNORM_NONE:
  case UNORM_NFC:
  case UNORM_NFKC:
    break;
  case UNORM_NFD:
  case UNORM_NFKD:
    expansion_factor = 3;
    break;
  default:
    s_intl_error->m_error.code = U_ILLEGAL_ARGUMENT_ERROR;
    s_intl_error->m_error.custom_error_message =
      "normalizer_normalize: illegal normalization form";
    return null;
  }

  /* First convert the string to UTF-16. */
  UChar* uinput = NULL;
  int uinput_len = 0;
  UErrorCode status = U_ZERO_ERROR;
  intl_convert_utf8_to_utf16(&uinput, &uinput_len, input.data(), input.size(),
                             &status);

  if (U_FAILURE(status)) {
    s_intl_error->m_error.code = status;
    s_intl_error->m_error.custom_error_message =
      "Error converting string to UTF-16.";
    free(uinput);
    return null;
  }

  /* Allocate memory for the destination buffer for normalization */
  int uret_len = uinput_len * expansion_factor;
  UChar *uret_buf = (UChar*)malloc((uret_len + 1) * sizeof(UChar));

  /* normalize */
  int size_needed = unorm_normalize(uinput, uinput_len,
                                    (UNormalizationMode)form, (int32_t) 0,
                                    uret_buf, uret_len, &status);

  /* Bail out if an unexpected error occured.
   * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
   * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string
   * is empty).
   */
  if (U_FAILURE(status) &&
      status != U_BUFFER_OVERFLOW_ERROR &&
      status != U_STRING_NOT_TERMINATED_WARNING) {
    /* Record the error here as well, as every other failure path does. */
    s_intl_error->m_error.code = status;
    s_intl_error->m_error.custom_error_message = "Error normalizing string";
    free(uret_buf);
    free(uinput);
    return null;
  }

  if (size_needed > uret_len) {
    /* realloc does not seem to work properly - memory is corrupted
     * uret_buf =  eurealloc(uret_buf, size_needed + 1); */
    free(uret_buf);
    uret_buf = (UChar*)malloc((size_needed + 1) * sizeof(UChar));
    uret_len = size_needed;

    status = U_ZERO_ERROR;

    /* try normalize again */
    size_needed = unorm_normalize(
      uinput, uinput_len, (UNormalizationMode)form, (int32_t) 0,
      uret_buf, uret_len, &status);

    /* Bail out if an unexpected error occured. */
    if (U_FAILURE(status)) {
      /* Set error messages. */
      s_intl_error->m_error.code = status;
      s_intl_error->m_error.custom_error_message = "Error normalizing string";
      free(uret_buf);
      free(uinput);
      return null;
    }
  }

  free(uinput);

  /* the buffer we actually used */
  uret_len = size_needed;

  /* Convert normalized string from UTF-16 to UTF-8. */
  char* ret_buf = NULL;
  int ret_len = 0;
  intl_convert_utf16_to_utf8(&ret_buf, &ret_len, uret_buf, uret_len, &status);
  free(uret_buf);
  if (U_FAILURE(status)) {
    s_intl_error->m_error.code = status;
    s_intl_error->m_error.custom_error_message =
      "normalizer_normalize: error converting normalized text UTF-8";
    return null;
  }

  return String(ret_buf, ret_len, AttachString);
}
/** * @param mode same with ICU mode flag UNormalizationMode. */ size_t uni_normalize(char* src, size_t src_len, char* dst, size_t dst_capacity, int mode, int opt){ UNormalizationMode umode = (UNormalizationMode)mode; // status holder UErrorCode ustatus = U_ZERO_ERROR; // UChar source UChar *s; int32_t s_length, s_capacity; // UChar normalized UChar *d; int32_t d_length, d_capacity; // UTF8 normalzied int32_t dst_alloc; // convert UTF-8 -> UChar u_strFromUTF8(NULL, 0, &s_length, src, (int32_t)src_len, &ustatus); if(U_FAILURE(ustatus) && ustatus!=U_BUFFER_OVERFLOW_ERROR){ char buf[1024]; sprintf(buf,"ICU u_strFromUTF8(pre-flighting) error with %d\n", ustatus); fputs(buf, stderr); fflush(stderr); return 0; }else{ ustatus = U_ZERO_ERROR; } s_capacity = (s_length+7)/8*8; // for '\0' termination s = (UChar*)my_malloc(s_capacity*sizeof(UChar), MYF(MY_WME)); if(!s){ fputs("malloc failure\n", stderr); fflush(stderr); return 0; } s = u_strFromUTF8(s, s_length, NULL, src, (int32_t)src_len, &ustatus); if(U_FAILURE(ustatus)){ char buf[1024]; sprintf(buf,"ICU u_strFromUTF8 error with %d\n", ustatus); fputs(buf, stderr); fflush(stderr); my_free(s); return 0; }else{ ustatus = U_ZERO_ERROR; } // normalize d_length = unorm_normalize(s, s_length, umode, (int32_t)opt, NULL, 0, &ustatus); if(U_FAILURE(ustatus) && ustatus!=U_BUFFER_OVERFLOW_ERROR){ char buf[1024]; sprintf(buf,"ICU unorm_normalize(pre-flighting) error with %d\n", ustatus); fputs(buf, stderr); fflush(stderr); my_free(s); return 0; }else{ ustatus = U_ZERO_ERROR; } d_capacity = (d_length+7)/8*8; d = (UChar*)my_malloc(d_capacity*sizeof(UChar), MYF(MY_WME)); if(!d){ fputs("malloc failure\n", stderr); fflush(stderr); my_free(s); return 0; } d_length = unorm_normalize(s, s_length, umode, (int32_t)opt, d, d_capacity, &ustatus); if(U_FAILURE(ustatus)){ char buf[1024]; sprintf(buf,"ICU unorm_normalize error with %d\n", ustatus); fputs(buf, stderr); fflush(stderr); my_free(s); my_free(d); return 0; }else{ ustatus = U_ZERO_ERROR; } 
my_free(s); // encode UChar -> UTF-8 u_strToUTF8(dst, (int32_t)dst_capacity, &dst_alloc, d, d_length, &ustatus); my_free(d); return (size_t)dst_alloc; }
/**
 * Compute the confusability skeleton of a string.
 *
 * The input is normalized to NFD, each code point is replaced by its mapping
 * from the confusable table selected by `type`, and the concatenated result
 * is normalized to NFD again if necessary before being copied to `dest`.
 *
 * @param sc           the spoof checker; validated via SpoofImpl::validateThis().
 * @param type         0 or a combination of USPOOF_SINGLE_SCRIPT_CONFUSABLE
 *                     and USPOOF_ANY_CASE; selects the mapping table.
 * @param s            input string.
 * @param length       length of s, or -1 (values below -1 are rejected).
 * @param dest         output buffer; must be NULL when destCapacity is 0.
 * @param destCapacity capacity of dest in UChars; must not be negative.
 * @param status       in/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR
 *                     for bad arguments, U_MEMORY_ALLOCATION_ERROR when a
 *                     working buffer cannot be allocated,
 *                     U_BUFFER_OVERFLOW_ERROR when dest is too small, and
 *                     U_STRING_NOT_TERMINATED_WARNING when the result fits
 *                     exactly and cannot be NUL-terminated.
 * @return the length of the skeleton in UChars, or 0 on an early failure.
 */
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
                   uint32_t type,
                   const UChar *s,  int32_t length,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {

    // TODO:  this function could be sped up a bit
    //        Skip the input normalization when not needed, work from callers data.
    //        Put the initial skeleton straight into the caller's destination buffer.
    //        It probably won't need normalization.
    //        But these would make the structure more complicated.

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
        (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Select the confusable mapping table for the requested check type.
    int32_t tableMask = 0;
    switch (type) {
      case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
      case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
      default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // NFD transform of the user supplied input.
    // A stack buffer is tried first; the heap is used only on overflow.
    UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar *nfdInput = nfdStackBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
        if (nfdInput == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
                                        nfdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
        if (nfdInput != nfdStackBuf) {
            uprv_free(nfdInput);
        }
        return 0;
    }

    // buffer to hold the Unicode defined skeleton mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

    // Apply the skeleton mapping to the NFD normalized input string
    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
    UnicodeString skelStr;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
        skelStr.append(buf, replaceLen);
    }

    if (nfdInput != nfdStackBuf) {
        uprv_free(nfdInput);
    }

    // A bogus string means an append could not be completed; report that
    // rather than continuing with an unusable buffer.
    if (skelStr.isBogus()) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    const UChar *result = skelStr.getBuffer();
    int32_t resultLen = skelStr.length();
    UChar *normedResult = NULL;

    // Check the skeleton for NFD, normalize it if needed.
    // Unnormalized results should be very rare.
    UBool alreadyNFD = unorm_isNormalized(result, resultLen, UNORM_NFD, status);
    if (U_FAILURE(*status)) {
        // Do not let the status reset below hide a failed check.
        return 0;
    }
    if (!alreadyNFD) {
        // Pre-flight to learn the required length; a buffer overflow is the
        // expected outcome here, anything else is a real error.
        normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
        if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
            return 0;
        }
        normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
        if (normedResult == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
        result = normedResult;
        resultLen = normalizedLen;
    }

    // Copy the skeleton to the caller's buffer
    if (U_SUCCESS(*status)) {
        if (destCapacity == 0 || resultLen > destCapacity) {
            *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR
                                             : U_STRING_NOT_TERMINATED_WARNING;
        } else {
            u_memcpy(dest, result, resultLen);
            if (destCapacity > resultLen) {
                dest[resultLen] = 0;
            } else {
                *status = U_STRING_NOT_TERMINATED_WARNING;
            }
        }
    }
    uprv_free(normedResult);
    return resultLen;
}