/**
 * Transliterates a UTF-8 string with the shared s_transliterator.
 *
 * @param str             UTF-8 input. Its full length() is converted, so the
 *                        old-ICU and new-ICU paths see the same text.
 * @param remove_accents  true  -> s_transliterator->transliterate();
 *                        false -> s_transliterator->transliterate_with_accents().
 * @return the transliterated text re-encoded as UTF-8.
 */
String f_icu_transliterate(CStrRef str, bool remove_accents) {
#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  const int32_t length = str.length();
  int32_t unitsWritten = 0;
  // ICU requires a valid, non-failing UErrorCode pointer; NULL is not allowed.
  UErrorCode convStatus = U_ZERO_ERROR;
  UnicodeString u_str;
  // `length` UTF-8 bytes never decode to more than `length` UTF-16 units.
  u_strFromUTF8WithSub(u_str.getBuffer(length + 1), length + 1, &unitsWritten,
                       str.data(), length, 0xfffd, NULL, &convStatus);
  // Never publish a length that a failed conversion did not actually produce.
  u_str.releaseBuffer(U_SUCCESS(convStatus) ? unitsWritten : 0);
#else
  // Pass the explicit length so the input is not cut at an embedded NUL.
  UnicodeString u_str =
    UnicodeString::fromUTF8(icu::StringPiece(str.data(), str.length()));
#endif

  if (remove_accents) {
    s_transliterator->transliterate(u_str);
  } else {
    s_transliterator->transliterate_with_accents(u_str);
  }

  // Convert the UnicodeString back into a UTF8 String.
#if HAVE_OLD_LIBICU
  return icuStringToUTF8(u_str);
#else
  // A UTF-16 code unit expands to at most three UTF-8 bytes (a surrogate
  // pair, i.e. two units, becomes four bytes). One extra byte holds the NUL.
  const int32_t capacity = u_str.length() * 3 + 1;
  char* out = static_cast<char*>(malloc(capacity));
  // Keep the last byte out of the sink's reach so the terminator always fits.
  CheckedArrayByteSink bs(out, capacity - 1);
  u_str.toUTF8(bs);
  // String(out, AttachString) receives only the pointer, and the sink does
  // not terminate the buffer itself, so terminate it explicitly.
  out[bs.NumberOfBytesWritten()] = '\0';
  return String(out, AttachString);
#endif
}
/**
 * Returns the text of this encoding match converted to UTF-8.
 *
 * The text is first fetched as UTF-16 through ucsdet_getUChars(), growing the
 * buffer and retrying while ICU reports U_BUFFER_OVERFLOW_ERROR, and is then
 * re-encoded as UTF-8.
 *
 * @return the match text as a UTF-8 String.
 * @throws Exception when ucsdet_getUChars() ends with a failure status.
 */
String c_EncodingMatch::t_getutf8() {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingMatch, EncodingMatch::getutf8);
  validate();

  UErrorCode status;
  icu::UnicodeString ustr;
  int32_t ustrSize = ustr.getCapacity();

  do {
    status = U_ZERO_ERROR;
    UChar* buf = ustr.getBuffer(ustrSize);
    // On overflow the return value is the required size, which the next
    // iteration requests from getBuffer().
    ustrSize = ucsdet_getUChars(m_encoding_match, buf, ustrSize, &status);
    // Publish the exact length ICU reported rather than letting
    // releaseBuffer() scan for a NUL terminator, which would cut the text at
    // an embedded U+0000. On failure/overflow the contents are discarded.
    ustr.releaseBuffer(U_SUCCESS(status) ? ustrSize : 0);
  } while (status == U_BUFFER_OVERFLOW_ERROR);

  if (U_FAILURE(status)) {
    throw Exception(
      "Could not get UTF-8 for match, error %d (%s)",
      status, u_errorName(status));
  }

#if HAVE_OLD_LIBICU
  std::string utf8str(icuStringToUTF8(ustr));
#else
  std::string utf8str;
  ustr.toUTF8String(utf8str);
#endif
  return String(utf8str);
}
/*
 * Splits `text` into tokens and returns them as an array, after each token
 * has been passed through normalizeToken(). The result always starts with
 * the "_B_" marker and ends with the "_E_" marker; tokens that are empty or
 * that match s_spaceMatcher after normalization are left out.
 *
 * Default normalization, by token type:
 *   Whitespace:  dropped (removed from output)
 *   Words:       converted to lower case
 *   Numbers:     replaced with #XXX, where the number of X's is based on the
 *                format of the number; any punctuation is maintained
 *   Japanese/Chinese scripts: converted to lower case
 *   Email:       converted to TOKEN_EMAIL
 *   URL:         converted to TOKEN_URL
 *   Emoticon:    left as-is
 *   Heart:       converted to TOKEN_HEART
 *   Exclamation: replaced with an empty string
 *   Date:        replaced with TOKEN_DATE
 *   Money:       replaced with TOKEN_MONEY
 *   Time:        replaced with TOKEN_TIME
 *   Acronym:     converted to lower case
 *   Other:       replaced with an empty string
 */
Array f_icu_tokenize(CStrRef text) {
  // Sentinels that bracket the returned token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  std::vector<Token> tokens;
  tokenizeString(tokens, getMaster(), UnicodeString::fromUTF8(text.data()));

  Array ret;
  int slot = 0;
  ret.set(slot++, BEGIN_MARKER);

  for (size_t idx = 0; idx < tokens.size(); ++idx) {
    Token& tok = tokens[idx];
    normalizeToken(tok);

    const UnicodeString& word = tok.value;
    // Skip whitespace-only and empty tokens.
    if (s_spaceMatcher->matches(word) || word.length() <= 0) {
      continue;
    }
    ret.set(slot++, String(icuStringToUTF8(word)));
  }

  ret.set(slot++, END_MARKER);
  return ret;
}
/*
 * Returns a list of tokens, but with various normalizations performed
 * based on the token type. The list always begins with the "_B_" marker and
 * ends with the "_E_" marker; tokens that are empty or match s_spaceMatcher
 * after normalizeToken() are omitted.
 *
 * Default behavior:
 *   Whitespace:  dropped (removed from output)
 *   Words:       converted to lower case
 *   Numbers:     replaced with #XXX, where the number of X's is based on the
 *                format of the number; any punctuation is maintained
 *   Japanese/Chinese scripts: converted to lower case
 *   Email:       converted to TOKEN_EMAIL
 *   URL:         converted to TOKEN_URL
 *   Emoticon:    left as-is
 *   Heart:       converted to TOKEN_HEART
 *   Exclamation: replaced with an empty string
 *   Date:        replaced with TOKEN_DATE
 *   Money:       replaced with TOKEN_MONEY
 *   Time:        replaced with TOKEN_TIME
 *   Acronym:     converted to lower case
 *   Other:       replaced with an empty string
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  TAINT_OBSERVER(TAINT_BIT_MUTATED, TAINT_BIT_NONE);

#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  const int32_t length = text.length();
  int32_t unitsWritten = 0;
  // ICU requires a valid, non-failing UErrorCode pointer; NULL is not allowed.
  UErrorCode convStatus = U_ZERO_ERROR;
  UnicodeString input;
  // `length` UTF-8 bytes never decode to more than `length` UTF-16 units.
  u_strFromUTF8WithSub(input.getBuffer(length + 1), length + 1, &unitsWritten,
                       text.data(), length, 0xfffd, NULL, &convStatus);
  // Never publish a length that a failed conversion did not actually produce.
  input.releaseBuffer(U_SUCCESS(convStatus) ? unitsWritten : 0);
  tokenizeString(tokens, HPHP::kMaster, input);
#else
  // Pass the explicit length, as the old-ICU path does, so the input is not
  // cut at an embedded NUL.
  tokenizeString(
    tokens, HPHP::kMaster,
    UnicodeString::fromUTF8(icu::StringPiece(text.data(), text.length())));
#endif

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for (std::vector<Token>::iterator iter = tokens.begin();
       iter != tokens.end();
       ++iter) {
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if (!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}