C++ (Cpp) uch_str 예제들

프로그래밍 언어: C++ (Cpp)

메소드/함수: uch_str

hotexamples.com에서의 예제들: 3

C++ (Cpp) uch_str 예제 3개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, 실제 코드 가운데 가장 높은 평가를 받은 C++ (Cpp) uch_str 사용 예제들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: normstrngs.cpp 프로젝트: jan-ruzicka/tesseract

// Normalizes the UTF-8 string str8 with ICU according to u_mode and appends
// the resulting code points to *normed32. Code points flagged by
// Validator::IsZeroWidthMark are dropped, and when ocr_normalize is
// OCRNorm::kNormalize every remaining code point is passed through
// OCRNormalize before being stored.
static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                                 const char* str8,
                                 std::vector<char32>* normed32) {
  // ICU identifies a normalizer by a (data name, mode) pair rather than by a
  // single enum, so split u_mode into those two pieces.
  const bool is_compatibility =
      u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC;
  const bool is_composed =
      u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC;
  const char* const data_name = is_compatibility ? "nfkc" : "nfc";
  const UNormalization2Mode icu_mode =
      is_composed ? UNORM2_COMPOSE : UNORM2_DECOMPOSE;

  // ICU works on its own string type, so decode the UTF-8 input first.
  const icu::UnicodeString uch_str(str8, "UTF-8");

  IcuErrorCode status;
  // getInstance hands back a shared singleton; it must not be deleted here.
  const icu::Normalizer2* const icu_normalizer =
      icu::Normalizer2::getInstance(nullptr, data_name, icu_mode, status);
  status.assertSuccess();
  status.reset();

  const icu::UnicodeString normalized =
      icu_normalizer->normalize(uch_str, status);
  status.assertSuccess();

  // length() counts UTF-16 code units, so this is only an upper-bound guess
  // for the number of code points that will be appended.
  normed32->reserve(normalized.length());

  // Walk the normalized text one full code point at a time.
  int pos = 0;
  while (pos < normalized.length()) {
    char32 code_point = normalized.char32At(pos);
    pos = normalized.moveIndex32(pos, 1);
    // Zero-width marks (ZWS, RTL and LTR marks) are not emitted.
    if (Validator::IsZeroWidthMark(code_point)) {
      continue;
    }
    if (ocr_normalize == OCRNorm::kNormalize) {
      code_point = OCRNormalize(code_point);
    }
    normed32->push_back(code_point);
  }
}

예제 #2

파일 보기

파일: normstrngs.cpp 프로젝트: hoiqs/tesseract-ocr

// Maps a fullwidth code point to its halfwidth counterpart.
// Code points outside the fullwidth-halfwidth block (0xFF00..0xFFEF), or
// rejected by IsValidCodepoint, are returned unchanged, with the single
// exception of 0x3000, which is always sent through the transliterator.
char32 FullwidthToHalfwidth(const char32 ch) {
  const bool in_fullwidth_block =
      ch >= 0xFF00 && ch <= 0xFFEF && IsValidCodepoint(ch);
  if (!in_fullwidth_block && ch != 0x3000) {
    return ch;
  }

  // The fullwidth left and right "white parentheses" are mapped by hand
  // instead of going through the transliterator.
  switch (ch) {
    case 0xFF5F:
      return 0x2985;
    case 0xFF60:
      return 0x2986;
    default:
      break;
  }

  // Wrap the code point in an ICU string so it can be transliterated in place.
  icu::UnicodeString uch_str(static_cast<UChar32>(ch));

  // Build a full-to-half width transliterator; this function owns it and
  // releases it below.
  IcuErrorCode status;
  const icu::Transliterator* converter = icu::Transliterator::createInstance(
      "Fullwidth-Halfwidth", UTRANS_FORWARD, status);
  status.assertSuccess();
  status.reset();

  converter->transliterate(uch_str);
  delete converter;

  // The transliteration must have left at least one code unit behind.
  ASSERT_HOST(uch_str.length() != 0);
  return uch_str[0];
}

예제 #3

파일 보기

파일: normstrngs.cpp 프로젝트: hoiqs/tesseract-ocr

// Normalizes the single code point ch with ICU's "nfkc" normalizer and stores
// the result in *str, replacing any previous contents.
//   ch:        the code point to normalize.
//   decompose: true selects UNORM2_DECOMPOSE, false selects UNORM2_COMPOSE.
//   str:       output; receives one entry per normalized code point, each
//              passed through OCRNormalize. If the normalized text contains a
//              space, *str instead holds just the original ch.
void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
  IcuErrorCode error_code;
  // Pointer to singleton does not require deletion.
  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
      nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE, error_code);
  error_code.assertSuccess();
  error_code.reset();

  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
  error_code.assertSuccess();

  str->clear();
  // UnicodeString stores UTF-16, so indexing by code unit would split any
  // code point above U+FFFF into two surrogate halves. Step by whole code
  // points instead, so each output entry is a complete char32.
  for (int offset = 0; offset < norm_str.length();
       offset = norm_str.moveIndex32(offset, 1)) {
    const UChar32 normed = norm_str.char32At(offset);
    // If any spaces were added by NFKC, pretend normalization is a nop.
    if (normed == ' ') {
      str->clear();
      str->push_back(ch);
      break;
    }
    str->push_back(OCRNormalize(static_cast<char32>(normed)));
  }
}