示例#1
0
// Applies the Unicode normalization form selected by u_mode to the UTF-8
// string str8 and appends the resulting code points to *normed32 as char32.
// Code points for which Validator::IsZeroWidthMark is true are dropped, and
// the remaining ones are passed through OCRNormalize when ocr_normalize is
// OCRNorm::kNormalize.
static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                                 const char* str8,
                                 std::vector<char32>* normed32) {
  // ICU identifies a normalizer by a data name plus a compose/decompose mode,
  // so split the four-way enum into those two independent choices.
  const bool is_compatibility =
      u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC;
  const bool is_composed =
      u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC;
  const char* data_name = is_compatibility ? "nfkc" : "nfc";
  const UNormalization2Mode mode =
      is_composed ? UNORM2_COMPOSE : UNORM2_DECOMPOSE;

  // Bring the input into ICU's string type.
  icu::UnicodeString input(str8, "UTF-8");
  IcuErrorCode status;
  // getInstance hands back a singleton, so there is nothing to delete.
  const icu::Normalizer2* normalizer =
      icu::Normalizer2::getInstance(nullptr, data_name, mode, status);
  status.assertSuccess();
  status.reset();
  icu::UnicodeString normalized = normalizer->normalize(input, status);
  status.assertSuccess();

  // Emit the result one code point at a time.
  // The reserve is only a capacity hint, not an exact count.
  normed32->reserve(normalized.length());
  int pos = 0;
  while (pos < normalized.length()) {
    char32 code_point = normalized.char32At(pos);
    // Advance first so that the skip below cannot stall the loop.
    pos = normalized.moveIndex32(pos, 1);
    // Zero-width marks (ZWS, RTL and LTR marks) never reach the output.
    if (Validator::IsZeroWidthMark(code_point)) continue;
    if (ocr_normalize == OCRNorm::kNormalize) {
      code_point = OCRNormalize(code_point);
    }
    normed32->push_back(code_point);
  }
}
示例#2
0
// Normalizes the single code point ch with ICU's "nfkc" data, decomposing when
// decompose is true and composing otherwise, and stores the OCR-normalized
// result in *str (which is cleared first). If the normalized form contains a
// space, normalization is treated as a no-op and *str holds just ch.
void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
  IcuErrorCode error_code;
  // getInstance returns a singleton; it must not be deleted.
  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
      nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE, error_code);
  error_code.assertSuccess();
  error_code.reset();

  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
  error_code.assertSuccess();

  str->clear();
  // Iterate by code point, not by UTF-16 code unit: indexing the string
  // directly would split any code point above U+FFFF into two surrogate
  // halves and push each half as if it were a character.
  for (int offset = 0; offset < norm_str.length();
       offset = norm_str.moveIndex32(offset, 1)) {
    const UChar32 norm_ch = norm_str.char32At(offset);
    // If any spaces were added by NFKC, pretend normalization is a nop.
    if (norm_ch == ' ') {
      str->clear();
      str->push_back(ch);
      break;
    }
    str->push_back(OCRNormalize(static_cast<char32>(norm_ch)));
  }
}
示例#3
0
// Returns true when ch1 and ch2 produce the same value under OCRNormalize.
bool IsOCREquivalent(char32 ch1, char32 ch2) {
  const auto first_normalized = OCRNormalize(ch1);
  const auto second_normalized = OCRNormalize(ch2);
  return first_normalized == second_normalized;
}