icu::UnicodeString u16(const char *u8, int32_t u8_len, UErrorCode &error, UChar32 subst /* =0 */) { error = U_ZERO_ERROR; if (u8_len == 0) { return icu::UnicodeString(); } int32_t outlen; if (subst) { u_strFromUTF8WithSub(nullptr, 0, &outlen, u8, u8_len, subst, nullptr, &error); } else { u_strFromUTF8(nullptr, 0, &outlen, u8, u8_len, &error); } if (error != U_BUFFER_OVERFLOW_ERROR) { return icu::UnicodeString(); } icu::UnicodeString ret; auto out = ret.getBuffer(outlen + 1); error = U_ZERO_ERROR; if (subst) { u_strFromUTF8WithSub(out, outlen + 1, &outlen, u8, u8_len, subst, nullptr, &error); } else { u_strFromUTF8(out, outlen + 1, &outlen, u8, u8_len, &error); } ret.releaseBuffer(outlen); if (U_FAILURE(error)) { return icu::UnicodeString(); } return ret; }
/*
 * Convert the payload of a text value into a palloc'd UTF-16 buffer.
 *
 * The input bytes are handed to u_strFromUTF8WithSub() with U+FFFD as the
 * substitution character.  A buffer of PREALLOC_SIZE code units is tried
 * first; if the result (plus one unit for the terminator) does not fit, the
 * buffer is replaced by one of exactly the required size and the conversion
 * is repeated.
 *
 * Returns the converted buffer, or NULL (after freeing the buffer) when
 * icu_failure() reports the final status as a failure.
 */
static UChar *
unicode_from_pg_text (text *pg_input)
{
  UErrorCode status = U_ZERO_ERROR;
  char *input = VARDATA (pg_input);
  int32_t len = VARSIZE (pg_input) - VARHDRSZ;
  UChar *ret = (UChar *) palloc (sizeof (UChar) * PREALLOC_SIZE);
  /* Start from a defined value: the converter is not guaranteed to store a
   * length when it fails for a reason other than buffer overflow, and the
   * value is read unconditionally below. */
  int32_t size = 0;

  u_strFromUTF8WithSub (ret, PREALLOC_SIZE, &size, input, len,
                        0xFFFD, NULL, &status);

  /* always allocate 1 character more than necessary, this ensures that
   * u_strFromUTF8WithSub() will write a zero-terminated string. */
  if (++size > PREALLOC_SIZE)
    {
      pfree (ret);
      ret = (UChar *) palloc (sizeof (UChar) * size);
      status = U_ZERO_ERROR;
      u_strFromUTF8WithSub (ret, size, NULL, input, len,
                            0xFFFD, NULL, &status);
    }

  if (icu_failure (status))
    {
      pfree (ret);
      return NULL;
    }

  return ret;
}
/**
 * Transliterates a UTF-8 string using the shared s_transliterator.
 *
 * The input is decoded to UTF-16, passed through either
 * s_transliterator->transliterate() (when remove_accents is true) or
 * s_transliterator->transliterate_with_accents(), and encoded back to UTF-8.
 *
 * @param str             UTF-8 input; its full length() is converted
 * @param remove_accents  selects which transliterator entry point is used
 * @return the transliterated text as a UTF-8 String
 */
String f_icu_transliterate(CStrRef str, bool remove_accents) {
  const int32_t length = str.length();

#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  int32_t unitsWritten = 0;
  // ICU requires a valid status pointer; passing NULL here makes the
  // conversion a no-op (or a crash, depending on the ICU version).
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString u_str;
  u_strFromUTF8WithSub(u_str.getBuffer(length + 1), length + 1, &unitsWritten,
                       str.data(), length, 0xfffd, NULL, &status);
  u_str.releaseBuffer(U_SUCCESS(status) ? unitsWritten : 0);
#else
  // Pass the explicit length so the input is not cut at an embedded NUL byte.
  UnicodeString u_str =
    UnicodeString::fromUTF8(icu::StringPiece(str.data(), length));
#endif

  if (remove_accents) {
    s_transliterator->transliterate(u_str);
  } else {
    s_transliterator->transliterate_with_accents(u_str);
  }

  // Convert the UnicodeString back into a UTF8 String.
#if HAVE_OLD_LIBICU
  return icuStringToUTF8(u_str);
#else
  // A UTF-16 code unit expands to at most three UTF-8 bytes (a surrogate
  // pair, i.e. two units, becomes four bytes), so length() * 3 always
  // suffices; one extra byte is kept for the terminator.
  int32_t capacity = u_str.length() * 3 + 1;
  char* out = (char *)malloc(capacity);
  CheckedArrayByteSink bs(out, capacity - 1);
  u_str.toUTF8(bs);
  // The sink does not terminate the buffer.
  // NOTE(review): String(out, AttachString) appears to derive the length
  // from the terminator -- confirm against the String constructor.
  out[bs.NumberOfBytesWritten()] = '\0';
  return String(out, AttachString);
#endif
}
/* {{{ intl_stringFromChar
 * Converts a UTF-8 byte string into the given UnicodeString.
 *
 * No substitution is performed (U_SENTINEL is passed as the substitution
 * character), so ill-formed input makes the conversion fail.
 *
 * ret      receives the converted text; set to bogus on any failure
 * str      UTF-8 input bytes
 * str_len  number of input bytes; must be below INT32_MAX
 * status   receives the ICU status (U_BUFFER_OVERFLOW_ERROR when the input
 *          is too long to be described with an int32_t capacity)
 *
 * Returns SUCCESS or FAILURE.
 */
int intl_stringFromChar(UnicodeString &ret, char *str, size_t str_len, UErrorCode *status)
{
	// ">=" rather than ">": the capacity computed below is str_len + 1,
	// which would overflow int32_t for str_len == INT32_MAX.
	if (str_len >= INT32_MAX) {
		*status = U_BUFFER_OVERFLOW_ERROR;
		ret.setToBogus();
		return FAILURE;
	}

	const int32_t src_len = (int32_t)str_len;

	//the number of UTF-16 code units is not larger than that of UTF-8 code
	//units, + 1 for the terminator
	int32_t capacity = src_len + 1;

	//no check necessary -- if NULL will fail ahead
	UChar *utf16 = ret.getBuffer(capacity);
	int32_t utf16_len = 0;
	*status = U_ZERO_ERROR;
	u_strFromUTF8WithSub(utf16, ret.getCapacity(), &utf16_len,
		str, src_len, U_SENTINEL /* no substitution */, NULL,
		status);
	ret.releaseBuffer(utf16_len);
	if (U_FAILURE(*status)) {
		ret.setToBogus();
		return FAILURE;
	}
	return SUCCESS;
}
bool ustring_from_char(icu::UnicodeString& ret, const String& str, UErrorCode &error) { int32_t capacity = str.size() + 1; UChar *utf16 = ret.getBuffer(capacity); int32_t utf16_len = 0; error = U_ZERO_ERROR; u_strFromUTF8WithSub(utf16, ret.getCapacity(), &utf16_len, str.c_str(), str.size(), U_SENTINEL /* no substitution */, nullptr, &error); ret.releaseBuffer(utf16_len); if (U_FAILURE(error)) { ret.setToBogus(); return false; } return true; }
/* Returns a list of tokens, but with various normalizations performed
 * based on the token type.
 *
 * Default behavior:
 * Whitespace: dropped (removed from output)
 * Words: converted to lower case
 * Numbers: replaced with #XXX, where the number of X's is based on the
 *          format of the number; any punctuation is maintained
 * Japanese/Chinese scripts: converted to lower case
 * Email: Converted to TOKEN_EMAIL
 * URL: Converted to TOKEN_URL
 * Emoticon: Left as-is
 * Heart: Converted to TOKEN_HEART
 * Exclamation: Replaced with an empty string
 * Date: Replaced with TOKEN_DATE
 * Money: Replaced with TOKEN_MONEY
 * Time: Replaced with TOKEN_TIME
 * Acronym: converted to lower case
 * Other: replaced with empty string
 *
 * The returned array always starts with the "_B_" marker and ends with the
 * "_E_" marker; the normalized, non-space, non-empty tokens sit in between,
 * in order, at consecutive integer keys.
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  TAINT_OBSERVER(TAINT_BIT_MUTATED, TAINT_BIT_NONE);

  const int32_t length = text.length();

#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  int32_t unitsWritten = 0;
  // ICU requires a valid status pointer; passing NULL here makes the
  // conversion a no-op (or a crash, depending on the ICU version).
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString input;
  u_strFromUTF8WithSub(input.getBuffer(length + 1), length + 1, &unitsWritten,
                       text.data(), length, 0xfffd, NULL, &status);
  input.releaseBuffer(U_SUCCESS(status) ? unitsWritten : 0);
#else
  // Pass the explicit length so the input is not cut at an embedded NUL byte.
  UnicodeString input =
    UnicodeString::fromUTF8(icu::StringPiece(text.data(), length));
#endif

  tokenizeString(tokens, HPHP::kMaster, input);

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for (std::vector<Token>::iterator iter = tokens.begin();
       iter != tokens.end();
       ++iter) {
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if (!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}