Beispiel #1
0
/* Transliterates str (UTF-8) via the shared transliterator and returns the
 * result as a UTF-8 String. When remove_accents is true, accents are stripped
 * as part of transliteration; otherwise they are preserved.
 */
String f_icu_transliterate(CStrRef str, bool remove_accents) {
#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  int32_t length = str.length();
  int32_t bytesWritten=0;
  UnicodeString u_str;
  u_strFromUTF8WithSub(u_str.getBuffer(length+1), length+1, &bytesWritten,
      str.data(), length, 0xfffd, NULL, NULL);
  u_str.releaseBuffer(bytesWritten);
#else
  // Pass an explicit length: the const char* overload of fromUTF8 would
  // strlen() and silently truncate input containing embedded NUL bytes.
  UnicodeString u_str =
    UnicodeString::fromUTF8(StringPiece(str.data(), str.length()));
#endif
  if (remove_accents) {
    s_transliterator->transliterate(u_str);
  } else {
    s_transliterator->transliterate_with_accents(u_str);
  }

  // Convert the UnicodeString back into a UTF8 String.
#if HAVE_OLD_LIBICU
  return icuStringToUTF8(u_str);
#else
  // Let toUTF8String() size the output itself. The previous hand-computed
  // capacity (countChar32() * sizeof(UChar) + 1) under-allocated for BMP
  // characters that need three UTF-8 bytes each, so CheckedArrayByteSink
  // silently truncated the result; the malloc'd buffer was also never
  // NUL-terminated before being attached to a String.
  std::string utf8;
  u_str.toUTF8String(utf8);
  return String(utf8);
#endif
}
Beispiel #2
0
/* Returns the matched text converted to UTF-8. Uses the standard ICU
 * preflight pattern: if the buffer is too small, ucsdet_getUChars() returns
 * the required size and sets U_BUFFER_OVERFLOW_ERROR, and we retry once
 * with a big-enough buffer. Throws on any other ICU failure.
 */
String c_EncodingMatch::t_getutf8() {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingMatch, EncodingMatch::getutf8);
  validate();

  UErrorCode status;
  icu::UnicodeString ustr;
  int32_t ustrSize = ustr.getCapacity();

  do {
    status = U_ZERO_ERROR;
    UChar* buf = ustr.getBuffer(ustrSize);
    int32_t len = ucsdet_getUChars(
      m_encoding_match,
      buf,
      ustrSize,
      &status);
    // Pass the filled length explicitly: releaseBuffer() with no argument
    // scans for a NUL terminator, reading uninitialized memory since
    // ucsdet_getUChars() does not NUL-terminate the buffer. On failure
    // (including overflow) nothing usable was written, so claim zero chars.
    // This also makes the old truncate(ustrSize) call unnecessary.
    ustr.releaseBuffer(U_SUCCESS(status) ? len : 0);
    ustrSize = len;
  } while (status == U_BUFFER_OVERFLOW_ERROR);

  if (U_FAILURE(status)) {
    throw Exception(
      "Could not get UTF-8 for match, error %d (%s)",
      status, u_errorName(status));
  }
#if HAVE_OLD_LIBICU
  std::string utf8str (icuStringToUTF8(ustr));
#else
  std::string utf8str;
  ustr.toUTF8String(utf8str);
#endif
  return String(utf8str);
}
Beispiel #3
0
/* Returns a list of tokens, but with various normalizations performed
 * based on the token type.
 *
 * Default behavior:
 * Whitespace: dropped (removed from output)
 * Words: converted to lower case
 * Numbers: replaced with #XXX, where the number of X's is based on the
 *          format of the number; any punctuation is maintained
 * Japanese/Chinese scripts: converted to lower case
 * Email: Converted to TOKEN_EMAIL
 * URL: Converted to TOKEN_URL
 * Emoticon: Left as-is
 * Heart: Converted to TOKEN_HEART
 * Exclamation: Replaced with an empty string
 * Date: Replaced with TOKEN_DATE
 * Money: Replaced with TOKEN_MONEY
 * Time: Replaced with TOKEN_TIME
 * Acronym: converted to lower case
 * Other: replaced with empty string
 *
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  // Pass an explicit length: fromUTF8(const char*) would strlen() and
  // silently truncate input containing embedded NUL bytes.
  tokenizeString(tokens, getMaster(),
                 UnicodeString::fromUTF8(StringPiece(text.data(),
                                                     text.length())));

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for (std::vector<Token>::iterator iter = tokens.begin();
       iter != tokens.end();
       ++iter) {  // pre-increment: avoids a useless iterator copy
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if (!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}
Beispiel #4
0
/* Returns a list of tokens, but with various normalizations performed
 * based on the token type.
 *
 * Default behavior:
 * Whitespace: dropped (removed from output)
 * Words: converted to lower case
 * Numbers: replaced with #XXX, where the number of X's is based on the
 *          format of the number; any punctuation is maintained
 * Japanese/Chinese scripts: converted to lower case
 * Email: Converted to TOKEN_EMAIL
 * URL: Converted to TOKEN_URL
 * Emoticon: Left as-is
 * Heart: Converted to TOKEN_HEART
 * Exclamation: Replaced with an empty string
 * Date: Replaced with TOKEN_DATE
 * Money: Replaced with TOKEN_MONEY
 * Time: Replaced with TOKEN_TIME
 * Acronym: converted to lower case
 * Other: replaced with empty string
 *
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  TAINT_OBSERVER(TAINT_BIT_MUTATED, TAINT_BIT_NONE);
#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  int32_t length = text.length();
  int32_t bytesWritten=0;
  UnicodeString input;
  u_strFromUTF8WithSub(input.getBuffer(length+1), length+1, &bytesWritten,
      text.data(), length, 0xfffd, NULL, NULL);
  input.releaseBuffer(bytesWritten);
  tokenizeString(tokens, HPHP::kMaster, input);
#else
  // Pass an explicit length, matching the old-ICU branch above:
  // fromUTF8(const char*) would strlen() and silently truncate input
  // containing embedded NUL bytes.
  tokenizeString(tokens, HPHP::kMaster,
                 UnicodeString::fromUTF8(StringPiece(text.data(),
                                                     text.length())));
#endif

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for (std::vector<Token>::iterator iter = tokens.begin();
       iter != tokens.end();
       ++iter) {  // pre-increment: avoids a useless iterator copy
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if (!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}