Example #1
0
File: icu.cpp Project: bd808/hhvm
/**
 * Convert a UTF-8 byte sequence into an icu::UnicodeString (UTF-16).
 *
 * The conversion runs twice: a preflight call with no destination buffer to
 * learn the required number of UTF-16 code units, then the real conversion
 * into a buffer of that size plus one.
 *
 * @param u8      UTF-8 input bytes.
 * @param u8_len  Length of the input; 0 short-circuits to an empty result.
 * @param error   Reset to U_ZERO_ERROR on entry; afterwards holds the status
 *                left by the last ICU call that was made.
 * @param subst   When non-zero, u_strFromUTF8WithSub() is used with this
 *                substitution code point; when zero, u_strFromUTF8() is used.
 * @return The converted string, or an empty UnicodeString when the preflight
 *         does not report U_BUFFER_OVERFLOW_ERROR or the conversion fails.
 */
icu::UnicodeString u16(const char *u8, int32_t u8_len, UErrorCode &error,
                       UChar32 subst /* =0 */) {
  error = U_ZERO_ERROR;
  if (u8_len == 0) {
    return icu::UnicodeString();
  }

  int32_t units = 0;
  // One place for the subst / no-subst dispatch used by both passes.
  auto convert = [&](UChar *dest, int32_t capacity) {
    error = U_ZERO_ERROR;
    if (subst) {
      u_strFromUTF8WithSub(dest, capacity, &units, u8, u8_len,
                           subst, nullptr, &error);
    } else {
      u_strFromUTF8(dest, capacity, &units, u8, u8_len, &error);
    }
  };

  // Pass 1: preflight. Anything other than "buffer too small" means there is
  // nothing to convert (or the input was rejected), so give up here.
  convert(nullptr, 0);
  if (error != U_BUFFER_OVERFLOW_ERROR) {
    return icu::UnicodeString();
  }

  // Pass 2: convert into a buffer with room for the reported length + 1.
  icu::UnicodeString result;
  const int32_t capacity = units + 1;
  UChar *dest = result.getBuffer(capacity);
  convert(dest, capacity);
  result.releaseBuffer(units);

  if (U_FAILURE(error)) {
    return icu::UnicodeString();
  }
  return result;
}
/*
 * Convert the payload of a PostgreSQL text value from UTF-8 to a palloc'd
 * UTF-16 buffer, replacing invalid sequences with U+FFFD.
 *
 * A buffer of PREALLOC_SIZE code units is tried first; if the converted
 * length plus one does not fit, the buffer is reallocated to exactly that
 * size and the conversion is repeated.
 *
 * Returns the buffer, or NULL (after freeing it) when icu_failure() reports
 * the final ICU status as a failure.
 */
static UChar *
unicode_from_pg_text (text *pg_input)
{
    char *utf8 = VARDATA (pg_input);
    int32_t utf8_len = VARSIZE (pg_input) - VARHDRSZ;

    UErrorCode rc = U_ZERO_ERROR;
    int32_t needed;
    UChar *buf = (UChar *) palloc (sizeof (UChar) * PREALLOC_SIZE);

    /* First attempt into the preallocated buffer; also yields the length. */
    u_strFromUTF8WithSub (buf, PREALLOC_SIZE, &needed, utf8, utf8_len, 0xFFFD, NULL, &rc);

    /* Reserve one extra code unit so that u_strFromUTF8WithSub() has room
     * to write the terminating zero. */
    needed += 1;
    if (needed > PREALLOC_SIZE)
    {
        pfree (buf);
        buf = (UChar *) palloc (sizeof (UChar) * needed);

        /* Discard the status of the first attempt before retrying. */
        rc = U_ZERO_ERROR;
        u_strFromUTF8WithSub (buf, needed, NULL, utf8, utf8_len, 0xFFFD, NULL, &rc);
    }

    if (!icu_failure (rc))
        return buf;

    pfree (buf);
    return NULL;
}
Example #3
0
/**
 * Transliterate a UTF-8 string through s_transliterator.
 *
 * @param str             UTF-8 input; its full length() is converted, so
 *                        embedded NUL bytes do not truncate the input.
 * @param remove_accents  true calls transliterate(), false calls
 *                        transliterate_with_accents().
 * @return The transliterated text converted back to UTF-8 by
 *         icuStringToUTF8().
 */
String f_icu_transliterate(CStrRef str, bool remove_accents) {
  const int32_t length = str.length();
#if HAVE_OLD_LIBICU
  // Modelled on the UnicodeString::setToUTF8 implementation.
  // ICU requires pErrorCode to be a valid pointer, so pass a real status
  // object rather than NULL.
  int32_t unitsWritten = 0;
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString u_str;
  u_strFromUTF8WithSub(u_str.getBuffer(length+1), length+1, &unitsWritten,
      str.data(), length, 0xfffd, NULL, &status);
  // On failure keep the string empty instead of trusting the length.
  u_str.releaseBuffer(U_SUCCESS(status) ? unitsWritten : 0);
#else
  // Pass the explicit length: fromUTF8(const char*) alone would stop at the
  // first NUL byte, unlike the HAVE_OLD_LIBICU path above.
  UnicodeString u_str =
      UnicodeString::fromUTF8(StringPiece(str.data(), length));
#endif
  if (remove_accents) {
    s_transliterator->transliterate(u_str);
  } else {
    s_transliterator->transliterate_with_accents(u_str);
  }

  // Convert the UnicodeString back into a UTF8 String. The same helper is
  // used for every ICU version: the former hand-rolled path sized its buffer
  // as countChar32() * sizeof(UChar) + 1, which is too small for 3- and
  // 4-byte UTF-8 sequences, and never NUL-terminated the buffer before
  // attaching it to a String.
  return icuStringToUTF8(u_str);
}
Example #4
0
/* {{{ intl_stringFromChar */
/*
 * Converts a UTF-8 byte string into the UTF-16 UnicodeString `ret`.
 *
 * ret      receives the converted text; set to bogus on any failure.
 * str      UTF-8 input bytes.
 * str_len  number of bytes in str; must be smaller than INT32_MAX.
 * status   reset to U_ZERO_ERROR before converting; holds the ICU status
 *          afterwards (U_BUFFER_OVERFLOW_ERROR when str_len is too large).
 *
 * Returns SUCCESS, or FAILURE when the input is too long or ICU reports a
 * failure. No substitution is performed (U_SENTINEL).
 */
int intl_stringFromChar(UnicodeString &ret, char *str, size_t str_len, UErrorCode *status)
{
	//str_len + 1 (for the terminator) must still fit in an int32_t, so
	//INT32_MAX itself has to be rejected as well
	if(str_len >= INT32_MAX) {
		*status = U_BUFFER_OVERFLOW_ERROR;
		ret.setToBogus();
		return FAILURE;
	}
	//the number of UTF-16 code units is not larger than that of UTF-8 code
	//units, + 1 for the terminator
	int32_t capacity = (int32_t)str_len + 1;

	//no check necessary -- if NULL will fail ahead
	UChar	*utf16 = ret.getBuffer(capacity);
	int32_t utf16_len = 0;
	*status = U_ZERO_ERROR;
	u_strFromUTF8WithSub(utf16, ret.getCapacity(), &utf16_len,
		str, (int32_t)str_len, U_SENTINEL /* no substitution */, NULL,
		status);
	ret.releaseBuffer(utf16_len);
	if (U_FAILURE(*status)) {
		ret.setToBogus();
		return FAILURE;
	}
	return SUCCESS;
}
Example #5
0
/**
 * Fill `ret` with the UTF-16 conversion of the UTF-8 string `str`.
 *
 * No substitution is done (U_SENTINEL is passed as the substitution
 * character).
 *
 * @param ret    Destination string; set to bogus when the conversion fails.
 * @param str    UTF-8 source.
 * @param error  Reset to U_ZERO_ERROR, then holds the ICU conversion status.
 * @return true on success, false when ICU reports a failure.
 */
bool ustring_from_char(icu::UnicodeString& ret,
                       const String& str,
                       UErrorCode &error) {
  error = U_ZERO_ERROR;

  // Request one code unit per input byte plus one extra.
  UChar *dest = ret.getBuffer(str.size() + 1);
  int32_t converted = 0;
  u_strFromUTF8WithSub(dest, ret.getCapacity(), &converted,
                       str.c_str(), str.size(),
                       U_SENTINEL /* no substitution */,
                       nullptr, &error);
  ret.releaseBuffer(converted);

  if (U_SUCCESS(error)) {
    return true;
  }
  ret.setToBogus();
  return false;
}
Example #6
0
/* Returns a list of tokens, but with various normalizations performed
 * based on the token type. The list is wrapped in the boundary markers
 * "_B_" (first element) and "_E_" (last element).
 *
 * The whole input (text.length() bytes of UTF-8) is tokenized; embedded NUL
 * bytes do not cut the input short.
 *
 * Default behavior:
 * Whitespace: dropped (removed from output)
 * Words: converted to lower case
 * Numbers: replaced with #XXX, where the number of X's is based on the
 *          format of the number; any punctuation is maintained
 * Japanese/Chinese scripts: converted to lower case
 * Email: Converted to TOKEN_EMAIL
 * URL: Converted to TOKEN_URL
 * Emoticon: Left as-is
 * Heart: Converted to TOKEN_HEART
 * Exclamation: Replaced with an empty string
 * Date: Replaced with TOKEN_DATE
 * Money: Replaced with TOKEN_MONEY
 * Time: Replaced with TOKEN_TIME
 * Acronym: converted to lower case
 * Other: replaced with empty string
 *
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  TAINT_OBSERVER(TAINT_BIT_MUTATED, TAINT_BIT_NONE);

  const int32_t length = text.length();
#if HAVE_OLD_LIBICU
  // Modelled on the UnicodeString::setToUTF8 implementation.
  // ICU requires pErrorCode to be a valid pointer, so pass a real status
  // object rather than NULL.
  int32_t unitsWritten = 0;
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString input;
  u_strFromUTF8WithSub(input.getBuffer(length+1), length+1, &unitsWritten,
      text.data(), length, 0xfffd, NULL, &status);
  // On failure keep the string empty instead of trusting the length.
  input.releaseBuffer(U_SUCCESS(status) ? unitsWritten : 0);
#else
  // Pass the explicit length: fromUTF8(const char*) alone would stop at the
  // first NUL byte, unlike the HAVE_OLD_LIBICU path above.
  UnicodeString input =
      UnicodeString::fromUTF8(StringPiece(text.data(), length));
#endif
  tokenizeString(tokens, HPHP::kMaster, input);

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for(std::vector<Token>::iterator iter = tokens.begin();
      iter != tokens.end();
      ++iter) {
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if(!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}