C++ (Cpp) icuStringToUTF8 예제들

프로그래밍 언어: C++ (Cpp)

메소드/함수: icuStringToUTF8

hotexamples.com에서의 예제들: 4

C++ (Cpp) icuStringToUTF8 - 4개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 C++ (Cpp) icuStringToUTF8의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: ext_icu.cpp 프로젝트: d1saster/hiphop-php

/**
 * Transliterates a UTF-8 string through s_transliterator and returns the
 * result re-encoded as UTF-8.
 *
 * @param str            UTF-8 input. Its full byte length is converted, so
 *                       embedded NUL bytes do not cut the input short.
 * @param remove_accents when true calls transliterate(), otherwise
 *                       transliterate_with_accents().
 * @return the transliterated text as a UTF-8 String.
 */
String f_icu_transliterate(CStrRef str, bool remove_accents) {
  const int32_t length = str.length();
#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  int32_t unitsWritten = 0;
  // ICU functions return immediately without doing any work when the
  // UErrorCode pointer is NULL, so a real status variable is required.
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString u_str;
  // UTF-16 never needs more code units than the UTF-8 source has bytes, so
  // length+1 units are enough. Ill-formed input is replaced with U+FFFD.
  u_strFromUTF8WithSub(u_str.getBuffer(length+1), length+1, &unitsWritten,
      str.data(), length, 0xfffd, NULL, &status);
  // Never publish a length that the failed conversion did not produce.
  u_str.releaseBuffer(U_SUCCESS(status) ? unitsWritten : 0);
#else
  // Pass the length explicitly: a bare const char* would be measured with
  // strlen() and stop at the first NUL byte.
  UnicodeString u_str =
      UnicodeString::fromUTF8(StringPiece(str.data(), length));
#endif
  if (remove_accents) {
    s_transliterator->transliterate(u_str);
  } else {
    s_transliterator->transliterate_with_accents(u_str);
  }

  // Convert the UnicodeString back into a UTF8 String.
#if HAVE_OLD_LIBICU
  return icuStringToUTF8(u_str);
#else
  // One UTF-16 code unit expands to at most 3 UTF-8 bytes (a surrogate pair,
  // i.e. two units, becomes 4 bytes), so length() * 3 always suffices. The
  // extra byte holds the terminator written below.
  const int32_t capacity = u_str.length() * 3 + 1;
  char* out = static_cast<char*>(malloc(capacity));
  CheckedArrayByteSink bs(out, capacity);
  u_str.toUTF8(bs);
  // The sink only appends the converted bytes; no length is passed to the
  // String constructor, so the buffer has to be terminated explicitly.
  out[bs.NumberOfBytesWritten()] = '\0';

  return String(out, AttachString);
#endif
}

예제 #2

파일 보기

파일: ext_icu_ucsdet.cpp 프로젝트: d1saster/hiphop-php

/**
 * Returns the text of this encoding match converted to UTF-8.
 *
 * The text is first fetched as UTF-16 via ucsdet_getUChars(), growing the
 * buffer until it fits, and then re-encoded as UTF-8.
 *
 * @return the matched text as a UTF-8 String.
 * @throws Exception when ucsdet_getUChars() reports a failure other than a
 *         buffer overflow.
 */
String c_EncodingMatch::t_getutf8() {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingMatch, EncodingMatch::getutf8);
  validate();

  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeString ustr;
  int32_t ustrSize = ustr.getCapacity();

  do {
    status = U_ZERO_ERROR;
    UChar* buf = ustr.getBuffer(ustrSize);
    // On overflow the return value is the capacity needed for the retry.
    ustrSize = ucsdet_getUChars(
      m_encoding_match,
      buf,
      ustrSize,
      &status);
    // Hand the known length back instead of letting releaseBuffer() scan for
    // a NUL: the converted text may contain U+0000, and after a failed call
    // the buffer content is not meaningful, so nothing is kept in that case.
    ustr.releaseBuffer(U_SUCCESS(status) ? ustrSize : 0);
  } while (status == U_BUFFER_OVERFLOW_ERROR);

  if (U_FAILURE(status)) {
    throw Exception(
      "Could not get UTF-8 for match, error %d (%s)",
      status, u_errorName(status));
  }
#if HAVE_OLD_LIBICU
  std::string utf8str (icuStringToUTF8(ustr));
#else
  std::string utf8str;
  ustr.toUTF8String(utf8str);
#endif
  return String(utf8str);
}

예제 #3

파일 보기

파일: ext_icu.cpp 프로젝트: HendrikGrunstra/hiphop-php

/* Returns a list of tokens, but with various normalizations performed
 * based on the token type.
 *
 * Default behavior:
 * Whitespace: dropped (removed from output)
 * Words: converted to lower case
 * Numbers: replaced with #XXX, where the number of X's is based on the
 *          format of the number; any punctuation is maintained
 * Japanese/Chinese scripts: converted to lower case
 * Email: Converted to TOKEN_EMAIL
 * URL: Converted to TOKEN_URL
 * Emoticon: Left as-is
 * Heart: Converted to TOKEN_HEART
 * Exclamation: Replaced with an empty string
 * Date: Replaced with TOKEN_DATE
 * Money: Replaced with TOKEN_MONEY
 * Time: Replaced with TOKEN_TIME
 * Acronym: converted to lower case
 * Other: replaced with empty string
 *
 * text:    UTF-8 input; its full byte length is tokenized.
 * returns: an Array indexed from 0 whose first element is "_B_" and whose
 *          last element is "_E_", with the normalized, non-space, non-empty
 *          tokens (as UTF-8 strings) in between.
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  // Pass the length explicitly: a bare const char* would be measured with
  // strlen() and the input would stop at the first NUL byte.
  tokenizeString(
      tokens, getMaster(),
      UnicodeString::fromUTF8(StringPiece(text.data(), text.length())));

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for (std::vector<Token>::iterator iter = tokens.begin();
       iter != tokens.end();
       ++iter) {
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if (!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}

예제 #4

파일 보기

파일: ext_icu.cpp 프로젝트: d1saster/hiphop-php

/* Returns a list of tokens, but with various normalizations performed
 * based on the token type.
 *
 * Default behavior:
 * Whitespace: dropped (removed from output)
 * Words: converted to lower case
 * Numbers: replaced with #XXX, where the number of X's is based on the
 *          format of the number; any punctuation is maintained
 * Japanese/Chinese scripts: converted to lower case
 * Email: Converted to TOKEN_EMAIL
 * URL: Converted to TOKEN_URL
 * Emoticon: Left as-is
 * Heart: Converted to TOKEN_HEART
 * Exclamation: Replaced with an empty string
 * Date: Replaced with TOKEN_DATE
 * Money: Replaced with TOKEN_MONEY
 * Time: Replaced with TOKEN_TIME
 * Acronym: converted to lower case
 * Other: replaced with empty string
 *
 * text:    UTF-8 input; its full byte length is tokenized.
 * returns: an Array indexed from 0 whose first element is "_B_" and whose
 *          last element is "_E_", with the normalized, non-space, non-empty
 *          tokens (as UTF-8 strings) in between.
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  TAINT_OBSERVER(TAINT_BIT_MUTATED, TAINT_BIT_NONE);
  const int32_t length = text.length();
#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  int32_t unitsWritten = 0;
  // ICU functions return immediately without doing any work when the
  // UErrorCode pointer is NULL, so a real status variable is required.
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString input;
  // UTF-16 never needs more code units than the UTF-8 source has bytes, so
  // length+1 units are enough. Ill-formed input is replaced with U+FFFD.
  u_strFromUTF8WithSub(input.getBuffer(length+1), length+1, &unitsWritten,
      text.data(), length, 0xfffd, NULL, &status);
  // Never publish a length that the failed conversion did not produce.
  input.releaseBuffer(U_SUCCESS(status) ? unitsWritten : 0);
  tokenizeString(tokens, HPHP::kMaster, input);
#else
  // Pass the length explicitly: a bare const char* would be measured with
  // strlen() and the input would stop at the first NUL byte.
  tokenizeString(tokens, HPHP::kMaster,
                 UnicodeString::fromUTF8(StringPiece(text.data(), length)));
#endif

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for (std::vector<Token>::iterator iter = tokens.begin();
       iter != tokens.end();
       ++iter) {
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if (!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}