// Decodes a single code point from the UTF-8 bytes at |*p| using CBU8_NEXT,
// with |end| bounding the readable range. On return |*p| has been advanced
// past the bytes the macro consumed, and the decoded value is returned.
// NOTE(review): the value produced for malformed input is whatever CBU8_NEXT
// yields (presumably a negative sentinel) -- confirm against the macro.
base_icu::UChar32 operator()(const char** p, const char* end) {
  const char* const start = *p;
  int consumed = 0;
  base_icu::UChar32 decoded = 0;
  CBU8_NEXT(start, consumed, end - start, decoded);
  *p = start + consumed;
  return decoded;
}
bool IsStringUTF8(const std::string& str) { const char *src = str.data(); int32 src_len = static_cast<int32>(str.length()); int32 char_index = 0; while (char_index < src_len) { int32 code_point; CBU8_NEXT(src, char_index, src_len, code_point); if (!IsValidCharacter(code_point)) return false; } return true; }
bool ReadUnicodeCharacter(const char* src, int32 src_len, int32* char_index, uint32* code_point) { // U8_NEXT使用-1表示错误, 因此code_point使用有符号类型. // 函数出错时返回false, 因此code_point使用无符号类型. int32 cp; CBU8_NEXT(src, *char_index, src_len, cp); *code_point = static_cast<uint32>(cp); // 上面的ICU宏移到下一字符, 函数要求移到最后一个用掉的字符. (*char_index)--; // 验证解码值的合法性 . return IsValidCodepoint(cp); }
bool ReadUTFChar(const char* str, int* begin, int length, unsigned* code_point_out) { int code_point; // Avoids warning when U8_NEXT writes -1 to it. CBU8_NEXT(str, *begin, length, code_point); *code_point_out = static_cast<unsigned>(code_point); // The ICU macro above moves to the next char, we want to point to the last // char consumed. (*begin)--; // Validate the decoded value. if (CBU_IS_UNICODE_CHAR(code_point)) return true; *code_point_out = kUnicodeReplacementCharacter; return false; }
void TruncateUTF8ToByteSize(const std::string& input, const size_t byte_size, std::string* output) { DCHECK(output); if(byte_size > input.length()) { *output = input; return; } DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); // 注意: 由于CBU8_NEXT使用的是int32. int32 truncation_length = static_cast<int32>(byte_size); int32 char_index = truncation_length - 1; const char* data = input.data(); // 使用CBU8, 从阶段点由后向前移动查找合法的UTF8字符. 一旦找到 // 一个UTF8字符, 截断到该字符作为输出. while(char_index >= 0) { int32 prev = char_index; uint32 code_point = 0; CBU8_NEXT(data, char_index, truncation_length, code_point); if(!base::IsValidCharacter(code_point) || !base::IsValidCodepoint(code_point)) { char_index = prev - 1; } else { break; } } if(char_index >= 0) { *output = input.substr(0, char_index); } else { output->clear(); } }
void TruncateUTF8ToByteSize(const std::string& input, const size_t byte_size, std::string* output) { DCHECK(output); if (byte_size > input.length()) { *output = input; return; } DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); // Note: This cast is necessary because CBU8_NEXT uses int32s. int32 truncation_length = static_cast<int32>(byte_size); int32 char_index = truncation_length - 1; const char* data = input.data(); // Using CBU8, we will move backwards from the truncation point // to the beginning of the string looking for a valid UTF8 // character. Once a full UTF8 character is found, we will // truncate the string to the end of that character. while (char_index >= 0) { int32 prev = char_index; base_icu::UChar32 code_point = 0; CBU8_NEXT(data, char_index, truncation_length, code_point); if (!IsValidCharacter(code_point) || !IsValidCodepoint(code_point)) { char_index = prev - 1; } else { break; } } if (char_index >= 0) *output = input.substr(0, char_index); else output->clear(); }