// Helper runs a standard unicode normalization, optional OCR normalization, // and leaves the result as char32 for subsequent processing. static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize, const char* str8, std::vector<char32>* normed32) { // Convert to ICU string for unicode normalization. icu::UnicodeString uch_str(str8, "UTF-8"); IcuErrorCode error_code; // Convert the enum to the new weird icu representation. const char* norm_type = u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC ? "nfkc" : "nfc"; UNormalization2Mode compose = u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC ? UNORM2_COMPOSE : UNORM2_DECOMPOSE; // Pointer to singleton does not require deletion. const icu::Normalizer2* normalizer = icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code); error_code.assertSuccess(); error_code.reset(); icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code); error_code.assertSuccess(); // Convert to char32 for output. OCR normalization if required. normed32->reserve(norm_str.length()); // An approximation. for (int offset = 0; offset < norm_str.length(); offset = norm_str.moveIndex32(offset, 1)) { char32 ch = norm_str.char32At(offset); // Skip all ZWS, RTL and LTR marks. if (Validator::IsZeroWidthMark(ch)) continue; if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch); normed32->push_back(ch); } }
// Maps a fullwidth character to its halfwidth counterpart.
// Only valid code points inside the Halfwidth and Fullwidth Forms block
// (U+FF00..U+FFEF) and the ideographic space U+3000 are converted; any other
// input is returned unchanged. The result is the first UTF-16 code unit that
// ICU's "Fullwidth-Halfwidth" transliterator produces for ch.
char32 FullwidthToHalfwidth(const char32 ch) {
  const bool in_fullwidth_block =
      ch >= 0xFF00 && ch <= 0xFFEF && IsValidCodepoint(ch);
  if (!in_fullwidth_block && ch != 0x3000) {
    return ch;  // Nothing to convert.
  }

  // The fullwidth left/right "white parentheses" are mapped by hand instead
  // of going through the transliterator.
  switch (ch) {
    case 0xFF5F:
      return 0x2985;
    case 0xFF60:
      return 0x2986;
    default:
      break;
  }

  // Build a one-shot full-to-half width transliterator. createInstance
  // returns an object owned by the caller, hence the delete below.
  IcuErrorCode status;
  const icu::Transliterator* converter = icu::Transliterator::createInstance(
      "Fullwidth-Halfwidth", UTRANS_FORWARD, status);
  status.assertSuccess();
  status.reset();

  icu::UnicodeString text(static_cast<UChar32>(ch));
  converter->transliterate(text);  // Rewrites text in place.
  delete converter;

  ASSERT_HOST(text.length() != 0);
  return text[0];
}
// Normalizes the single code point ch using ICU's "nfkc" data, in decompose
// mode when decompose is true and in compose mode otherwise, then applies
// OCRNormalize to every resulting code point and stores the sequence in *str
// (any previous content of *str is discarded).
// If the normalized form contains a space, normalization is treated as a
// no-op and *str is set to just the original, unmodified ch.
void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
  IcuErrorCode error_code;
  // getInstance returns a shared singleton; it must not be deleted.
  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
      nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE,
      error_code);
  error_code.assertSuccess();
  error_code.reset();

  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
  error_code.assertSuccess();

  str->clear();
  // Walk the result by code point, not by UTF-16 code unit. Indexing with
  // operator[] would split a supplementary-plane character into its two
  // surrogate halves and emit each half as a bogus char32.
  for (int offset = 0; offset < norm_str.length();
       offset = norm_str.moveIndex32(offset, 1)) {
    const char32 norm_ch = static_cast<char32>(norm_str.char32At(offset));
    // If any spaces were added by NFKC, pretend normalization is a nop.
    if (norm_ch == ' ') {
      str->clear();
      str->push_back(ch);
      return;
    }
    str->push_back(OCRNormalize(norm_ch));
  }
}