bool ReadUnicodeCharacter(const char16* src, int32 src_len, int32* char_index, uint32* code_point) { if(CBU16_IS_SURROGATE(src[*char_index])) { if(!CBU16_IS_SURROGATE_LEAD(src[*char_index]) || *char_index+1>=src_len || !CBU16_IS_TRAIL(src[*char_index+1])) { // 非法的高代理对. return false; } // 合法的高代理对. *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index], src[*char_index+1]); (*char_index)++; } else { // 不是高代理, 16-bit字长. *code_point = src[*char_index]; } return IsValidCodepoint(*code_point); }
bool ReadUnicodeCharacter(const char* src, int32 src_len, int32* char_index, uint32* code_point) { // U8_NEXT使用-1表示错误, 因此code_point使用有符号类型. // 函数出错时返回false, 因此code_point使用无符号类型. int32 cp; CBU8_NEXT(src, *char_index, src_len, cp); *code_point = static_cast<uint32>(cp); // 上面的ICU宏移到下一字符, 函数要求移到最后一个用掉的字符. (*char_index)--; // 验证解码值的合法性 . return IsValidCodepoint(cp); }
// Maps a fullwidth character to its halfwidth counterpart.
//
// |ch| is returned unchanged unless it is a valid code point inside
// 0xFF00..0xFFEF (the fullwidth-halfwidth Unicode block) or is 0x3000.
// 0xFF5F and 0xFF60 (the fullwidth "white parentheses") are mapped directly
// to 0x2985 and 0x2986. Everything else is run through ICU's
// "Fullwidth-Halfwidth" transliterator, and the first code unit of the
// transliterated string is returned.
char32 FullwidthToHalfwidth(const char32 ch) {
  const bool in_fullwidth_block =
      ch >= 0xFF00 && ch <= 0xFFEF && IsValidCodepoint(ch);
  if (!in_fullwidth_block && ch != 0x3000) {
    return ch;
  }

  // Special case for fullwidth left and right "white parentheses".
  switch (ch) {
    case 0xFF5F:
      return 0x2985;
    case 0xFF60:
      return 0x2986;
    default:
      break;
  }

  // Build a forward full-to-half width transliterator for this call.
  IcuErrorCode status;
  const icu::Transliterator* converter = icu::Transliterator::createInstance(
      "Fullwidth-Halfwidth", UTRANS_FORWARD, status);
  status.assertSuccess();
  status.reset();

  // Transliterate a one-character string in place, then release the
  // transliterator that createInstance() handed over.
  icu::UnicodeString text(static_cast<UChar32>(ch));
  converter->transliterate(text);
  delete converter;

  ASSERT_HOST(text.length() != 0);
  return text[0];
}
bool IsInterchangeValid(const char32 ch) { return IsValidCodepoint(ch) && !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters. !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) && !(ch >= 0x2FFFE && ch <= 0x2FFFF) && !(ch >= 0x3FFFE && ch <= 0x3FFFF) && !(ch >= 0x4FFFE && ch <= 0x4FFFF) && !(ch >= 0x5FFFE && ch <= 0x5FFFF) && !(ch >= 0x6FFFE && ch <= 0x6FFFF) && !(ch >= 0x7FFFE && ch <= 0x7FFFF) && !(ch >= 0x8FFFE && ch <= 0x8FFFF) && !(ch >= 0x9FFFE && ch <= 0x9FFFF) && !(ch >= 0xAFFFE && ch <= 0xAFFFF) && !(ch >= 0xBFFFE && ch <= 0xBFFFF) && !(ch >= 0xCFFFE && ch <= 0xCFFFF) && !(ch >= 0xDFFFE && ch <= 0xDFFFF) && !(ch >= 0xEFFFE && ch <= 0xEFFFF) && !(ch >= 0xFFFFE && ch <= 0xFFFFF) && !(ch >= 0x10FFFE && ch <= 0x10FFFF) && (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r'); }
void TruncateUTF8ToByteSize(const std::string& input, const size_t byte_size, std::string* output) { DCHECK(output); if (byte_size > input.length()) { *output = input; return; } DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); // Note: This cast is necessary because CBU8_NEXT uses int32s. int32 truncation_length = static_cast<int32>(byte_size); int32 char_index = truncation_length - 1; const char* data = input.data(); // Using CBU8, we will move backwards from the truncation point // to the beginning of the string looking for a valid UTF8 // character. Once a full UTF8 character is found, we will // truncate the string to the end of that character. while (char_index >= 0) { int32 prev = char_index; base_icu::UChar32 code_point = 0; CBU8_NEXT(data, char_index, truncation_length, code_point); if (!IsValidCharacter(code_point) || !IsValidCodepoint(code_point)) { char_index = prev - 1; } else { break; } } if (char_index >= 0) *output = input.substr(0, char_index); else output->clear(); }
// Returns true if |ch| is a 7-bit ASCII character acceptable for interchange.
// It must:
//   * pass IsValidCodepoint(),
//   * be no greater than 0x7F, the largest value representable in 7 bits, and
//   * not be an ISO control character, other than '\n', '\f', '\t', '\r'.
bool IsInterchangeValid7BitAscii(const char32 ch) {
  return IsValidCodepoint(ch) && ch <= 0x7F &&
         (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' ||
          ch == '\f' || ch == '\t' || ch == '\r');
}
bool IsWhitespace(const char32 ch) { ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n", ch); return u_isUWhiteSpace(static_cast<UChar32>(ch)); }