// Return nonzero if `cp` is a code point with the Unicode White_Space
// property, zero otherwise.
//
// This replaces a call to <ctype.h> isspace(), which is only defined for
// arguments representable as unsigned char (or EOF).  Passing an arbitrary
// decoded code point to isspace() is undefined behavior and, even when it
// happens to work, depends on the current locale.
//
// NOTE(review): if the project's string helpers already expose a Unicode
// whitespace predicate, prefer calling that instead of this local copy.
static int
S_ws_tokenizer_is_space(uint32_t cp) {
    return (cp >= 0x0009 && cp <= 0x000D)     // TAB, LF, VT, FF, CR
           || cp == 0x0020                    // SPACE
           || cp == 0x0085                    // NEXT LINE
           || cp == 0x00A0                    // NO-BREAK SPACE
           || cp == 0x1680                    // OGHAM SPACE MARK
           || (cp >= 0x2000 && cp <= 0x200A)  // EN QUAD..HAIR SPACE
           || cp == 0x2028                    // LINE SEPARATOR
           || cp == 0x2029                    // PARAGRAPH SEPARATOR
           || cp == 0x202F                    // NARROW NO-BREAK SPACE
           || cp == 0x205F                    // MEDIUM MATHEMATICAL SPACE
           || cp == 0x3000;                   // IDEOGRAPHIC SPACE
}

// Split `text` into maximal runs of non-whitespace characters and append
// one Token per run to `inversion`.
//
// self      - the tokenizer; not consulted by this implementation.
// text      - buffer to tokenize, walked one UTF-8 sequence at a time.
//             NOTE(review): assumed to be valid UTF-8 -- the loop trusts the
//             lead byte's entry in StrHelp_UTF8_COUNT to advance; confirm
//             that callers validate their input.
// len       - size of `text` in bytes.
// inversion - receives the tokens in the order they occur in `text`.
//
// Each Token is created from the run's first byte, its length in bytes, and
// the character (not byte) offsets of its start and end.  The trailing
// 1.0f and 1 are passed to Token_new unchanged (presumably boost and
// position increment -- confirm against Token_new).
void
WhitespaceTokenizer_Tokenize_Str_IMP(WhitespaceTokenizer *self,
                                     const char *text, size_t len,
                                     Inversion *inversion) {
    size_t byte_pos       = 0;
    size_t char_pos       = 0;
    size_t start_byte_pos = 0;
    size_t start_char_pos = 0;
    int    prev_ws        = 1;  // Pretend the text is preceded by whitespace.

    (void)self;

    while (byte_pos < len) {
        uint32_t cp = StrHelp_decode_utf8_char(text + byte_pos);
        int      ws = S_ws_tokenizer_is_space(cp);

        if (prev_ws && !ws) {
            // Whitespace -> non-whitespace: a new token starts here.
            start_byte_pos = byte_pos;
            start_char_pos = char_pos;
        }
        else if (!prev_ws && ws) {
            // Non-whitespace -> whitespace: the current token just ended.
            Token *token = Token_new(text + start_byte_pos,
                                     byte_pos - start_byte_pos,
                                     start_char_pos, char_pos, 1.0f, 1);
            Inversion_Append(inversion, token);
        }

        prev_ws = ws;
        // Advance by the sequence length looked up from the lead byte.
        byte_pos += StrHelp_UTF8_COUNT[(uint8_t)text[byte_pos]];
        char_pos += 1;
    }

    // Flush a token that runs up to the end of the text.
    if (!prev_ws) {
        Token *token = Token_new(text + start_byte_pos,
                                 byte_pos - start_byte_pos,
                                 start_char_pos, char_pos, 1.0f, 1);
        Inversion_Append(inversion, token);
    }
}
예제 #2
0
// Exhaustively exercise the UTF-8 helpers over every code point from 0
// through 0x10FFFF.  For each one the code point is encoded and four
// properties are checked in order; the first mismatch stops the scan.
// Reports PASS if every code point survives, otherwise FAIL with the
// offending code point.
static void
test_utf8_round_trip(TestBatchRunner *runner) {
    const int32_t limit = 0x110000;  // One past the last code point tested.
    int32_t       code_point = 0;

    while (code_point < limit) {
        char     bytes[4];
        uint32_t num_bytes = StrHelp_encode_utf8_char(code_point, bytes);
        char    *first     = bytes;
        char    *past_end  = first + num_bytes;

        // The encoded length must match the table entry for the lead byte.
        if (StrHelp_UTF8_COUNT[(unsigned char)bytes[0]] != num_bytes) {
            break;
        }

        // utf8_valid() and the alternate validator must reach the same
        // verdict (both normalized to 0/1 before comparing).
        {
            int valid     = !!StrHelp_utf8_valid(first, num_bytes);
            int valid_alt = !!S_utf8_valid_alt(first, num_bytes);
            if (valid != valid_alt) {
                break;
            }
        }

        // Stepping back one character from the end must land on the start.
        if (StrHelp_back_utf8_char(past_end, first) != first) {
            break;
        }

        // Decoding must give back the code point that was encoded.
        if (StrHelp_decode_utf8_char(bytes) != code_point) {
            break;
        }

        code_point++;
    }

    if (code_point != limit) {
        FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point);
    }
    else {
        PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
    }
}