/* Return nonzero if `cp` is one of the code points carrying the Unicode
 * White_Space property, zero otherwise.
 *
 * This replaces a call to isspace(), which is only defined for arguments
 * representable as unsigned char (or EOF) and is locale-dependent.  Passing
 * an arbitrary decoded code point to it is undefined behavior.
 */
static int
S_wstok_is_whitespace(uint32_t cp) {
    switch (cp) {
        /* <control-0009>..<control-000D> */
        case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D:
        case 0x0020: /* SPACE */
        case 0x0085: /* <control-0085> (NEXT LINE) */
        case 0x00A0: /* NO-BREAK SPACE */
        case 0x1680: /* OGHAM SPACE MARK */
        /* EN QUAD..HAIR SPACE */
        case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004:
        case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009:
        case 0x200A:
        case 0x2028: /* LINE SEPARATOR */
        case 0x2029: /* PARAGRAPH SEPARATOR */
        case 0x202F: /* NARROW NO-BREAK SPACE */
        case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
        case 0x3000: /* IDEOGRAPHIC SPACE */
            return 1;
        default:
            return 0;
    }
}

/* Split `text` (`len` bytes of UTF-8) into runs of non-whitespace characters
 * and append one Token per run to `inversion`.
 *
 * Two positions are tracked in parallel: `byte_pos` indexes into `text`,
 * while `char_pos` is incremented once per decoded character.  Each token is
 * created from the run's bytes together with the character positions at
 * which the run started and ended; the trailing 1.0f and 1 arguments are
 * passed to Token_new() unchanged for every token.
 *
 * `self` is not consulted.
 */
void
WhitespaceTokenizer_Tokenize_Str_IMP(WhitespaceTokenizer *self,
                                     const char *text, size_t len,
                                     Inversion *inversion) {
    size_t byte_pos       = 0;
    size_t char_pos       = 0;
    size_t start_byte_pos = 0;
    size_t start_char_pos = 0;
    /* Starting in the "whitespace" state makes a leading non-whitespace
     * character open a token at position 0. */
    int    prev_ws        = 1;

    (void)self;

    while (byte_pos < len) {
        uint32_t cp = StrHelp_decode_utf8_char(text + byte_pos);
        int      ws = S_wstok_is_whitespace(cp);

        if (prev_ws && !ws) {
            /* Whitespace -> text: remember where the token begins. */
            start_byte_pos = byte_pos;
            start_char_pos = char_pos;
        }
        else if (!prev_ws && ws) {
            /* Text -> whitespace: the token ends just before this char. */
            Token *token = Token_new(text + start_byte_pos,
                                     byte_pos - start_byte_pos,
                                     start_char_pos, char_pos, 1.0f, 1);
            Inversion_Append(inversion, token);
        }
        prev_ws = ws;

        size_t step = StrHelp_UTF8_COUNT[(uint8_t)text[byte_pos]];
        /* NOTE(review): if the table holds 0 for bytes that cannot start a
         * UTF-8 sequence, malformed input would stall this loop forever.
         * Force forward progress -- confirm against the table contents. */
        if (step == 0) {
            step = 1;
        }
        byte_pos += step;
        char_pos += 1;
    }

    if (!prev_ws) {
        /* Input ended inside a token: flush it. */
        Token *token = Token_new(text + start_byte_pos,
                                 byte_pos - start_byte_pos,
                                 start_char_pos, char_pos, 1.0f, 1);
        Inversion_Append(inversion, token);
    }
}
/* Exhaustively exercise the UTF-8 helpers over every value from 0 through
 * 0x10FFFF.  Each value is encoded into a 4-byte scratch buffer and then
 * checked four ways, stopping at the first value for which any check fails.
 * Reports a single PASS when the whole range succeeds, otherwise a single
 * FAIL naming the offending value.
 */
static void
test_utf8_round_trip(TestBatchRunner *runner) {
    const int32_t past_last_cp = 0x110000;
    int32_t       cp           = 0;

    while (cp < past_last_cp) {
        char     encoded[4];
        uint32_t num_bytes = StrHelp_encode_utf8_char(cp, encoded);
        char    *first     = encoded;
        char    *limit     = first + num_bytes;

        /* The encoder's reported length must match the lead-byte table. */
        if (num_bytes != StrHelp_UTF8_COUNT[(unsigned char)encoded[0]]) {
            break;
        }

        /* utf8_valid() must agree with the alternate implementation;
         * both results are normalized to 0/1 before comparing. */
        if (!!StrHelp_utf8_valid(first, num_bytes)
            != !!S_utf8_valid_alt(first, num_bytes)) {
            break;
        }

        /* Stepping back one character from the end must land on the
         * start of the buffer. */
        if (StrHelp_back_utf8_char(limit, first) != first) {
            break;
        }

        /* Decoding must reproduce the value that was encoded. */
        if (StrHelp_decode_utf8_char(encoded) != cp) {
            break;
        }

        cp++;
    }

    if (cp != past_last_cp) {
        FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)cp);
    }
    else {
        PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
    }
}