예제 #1
0
/*
 * Splits the `len` bytes at `text` into tokens and appends them to
 * `inversion` by way of the S_parse_single and S_parse_word helpers.
 *
 * Throws ERR when the buffer ends in a truncated multi-byte UTF-8 sequence,
 * i.e. when one of the last three bytes is a lead byte that announces more
 * trailing bytes than the buffer still holds. `self` is not consulted.
 */
void
StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
                               size_t len, Inversion *inversion) {
    // Look at the tail only: a lead byte for a sequence of 2+ bytes in the
    // last slot, 3+ bytes in the second-to-last slot, or 4 bytes in the
    // third-to-last slot cannot be completed within `len` bytes.
    int truncated_tail = 0;
    if (len >= 1 && (uint8_t)text[len - 1] >= 0xC0) {
        truncated_tail = 1;
    }
    else if (len >= 2 && (uint8_t)text[len - 2] >= 0xE0) {
        truncated_tail = 1;
    }
    else if (len >= 3 && (uint8_t)text[len - 3] >= 0xF0) {
        truncated_tail = 1;
    }
    if (truncated_tail) {
        THROW(ERR, "Invalid UTF-8 sequence");
    }

    lucy_StringIter pos = { 0, 0 };

    while (pos.byte_pos < len) {
        int prop = S_wb_lookup(text + pos.byte_pos);

        // Properties in the range WB_ASingle..WB_ExtendNumLet start a token.
        // Each parser hands back the property of the character it stopped
        // at, which may start the next token right away.
        for (;;) {
            if (prop < WB_ASingle || prop > WB_ExtendNumLet) { break; }

            if (prop != WB_ASingle) {
                prop = S_parse_word(text, len, &pos, prop, inversion);
            }
            else {
                prop = S_parse_single(text, len, &pos, inversion);
            }

            if (pos.byte_pos >= len) { return; }
        }

        // The current character does not start a token; step over it.
        S_iter_advance(text, &pos);
    }
}
예제 #2
0
/*
 * Moves the iterator forward by at least one character and keeps moving for
 * as long as it lands on a character whose word break property is
 * WB_Extend_Format.
 *
 * Returns the property of the first other character reached. If the end of
 * the text is hit first, the most recently looked-up property is returned
 * instead: WB_Extend_Format when at least one such character was skipped,
 * -1 when nothing was looked up at all.
 */
static int
S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) {
    int prop = -1;

    for (;;) {
        S_iter_advance(text, iter);

        // Ran off the end: report whatever was seen last.
        if (iter->byte_pos >= len) { return prop; }

        prop = S_wb_lookup(text + iter->byte_pos);
        if (prop != WB_Extend_Format) { return prop; }
    }
}
예제 #3
0
/*
 * Parses one word that begins at the iterator's current position, which the
 * caller has classified as `state` (ALetter, Numeric or Katakana), and
 * appends the resulting token to `inversion`.
 *
 * The iterator is advanced past the word. A Mid* character that turns out
 * not to join two word parts is stepped over as well, but is left out of
 * the token. Returns the word break property most recently determined for
 * the character the iterator stopped at, or -1 if the text ended directly
 * after the word's first character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    const lucy_StringIter word_start = *iter;
    lucy_StringIter word_end;
    int prop = -1;
    int at_break = 0;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    word_end = *iter;

    while (!at_break && iter->byte_pos < len) {
        prop = S_wb_lookup(text + iter->byte_pos);

        if (prop == WB_ALetter || prop == WB_Numeric) {
            // Letters and digits never continue a Katakana run.
            at_break = (state == WB_Katakana);
        }
        else if (prop == WB_Katakana) {
            // Katakana never continues a run of letters or digits.
            at_break = (state == WB_ALetter || state == WB_Numeric);
        }
        else if (prop == WB_ExtendNumLet) {
            // Always extends the word.
        }
        else if (prop == WB_Extend_Format) {
            // Part of the word, but the state stays as it was.
            prop = state;
        }
        else if (prop == WB_MidNumLet
                 || prop == WB_MidLetter
                 || prop == WB_MidNum) {
            // A Mid* character only stays inside the word when it is legal
            // after the current state and the next character other than
            // Extend/Format has the same property as that state.
            at_break = 1;
            if ((state == WB_ALetter && prop != WB_MidNum)
                || (state == WB_Numeric && prop != WB_MidLetter)) {
                prop = S_skip_extend_format(text, len, iter);
                at_break = (prop != state);
            }
        }
        else {
            at_break = 1;
        }

        if (!at_break) {
            // Accept the current character and extend the token past it.
            state = prop;
            S_iter_advance(text, iter);
            word_end = *iter;
        }
    }

    Token *token = Token_new(text + word_start.byte_pos,
                             word_end.byte_pos - word_start.byte_pos,
                             word_start.char_pos, word_end.char_pos,
                             1.0f, 1);
    Inversion_Append(inversion, token);

    return prop;
}
예제 #4
0
/*
 * Parses one word that begins at the iterator's current position, which the
 * caller has classified as `state` (ALetter, Numeric, Katakana or
 * ExtendNumLet), and appends the resulting token to `inversion`.
 *
 * The iterator is advanced past the word. A quote or Mid* character that
 * turns out not to join two word parts is stepped over as well; it is left
 * out of the token, except for a single quote directly after a Hebrew
 * letter. Returns the word break property most recently determined for the
 * character the iterator stopped at, or -1 if the text ended directly after
 * the word's first character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    const lucy_StringIter word_start = *iter;
    lucy_StringIter word_end;
    int prop = -1;
    int at_break = 0;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    word_end = *iter;

    while (!at_break && iter->byte_pos < len) {
        prop = S_wb_lookup(text + iter->byte_pos);

        switch (prop) {
            case WB_ALetter:
            case WB_Hebrew_Letter:
            case WB_Numeric:
                // Rules WB5, WB8, WB9, WB10, and WB13b; only a Katakana run
                // is not continued.
                at_break = (state == WB_Katakana);
                break;

            case WB_Katakana:
                // Rules WB13 and WB13b.
                at_break = !(state == WB_Katakana
                             || state == WB_ExtendNumLet);
                break;

            case WB_ExtendNumLet:
                // Rule WB13a.
                break;

            case WB_Extend_Format:
                // Rule WB4. Part of the word, state stays as it was.
                prop = state;
                break;

            case WB_Single_Quote:
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                // Break unless one of the branches below finds a matching
                // character on the far side.
                at_break = 1;
                if (state == WB_ALetter || state == WB_Hebrew_Letter) {
                    if (prop != WB_MidNum) {
                        if (state == WB_Hebrew_Letter
                            && prop == WB_Single_Quote) {
                            // Rule WB7a. The quote joins the token even if
                            // no letter follows; the end moves by one byte
                            // and one character.
                            ++word_end.byte_pos;
                            ++word_end.char_pos;
                        }
                        prop = S_skip_extend_format(text, len, iter);
                        // Rules WB6 and WB7.
                        at_break = !(prop == WB_ALetter
                                     || prop == WB_Hebrew_Letter);
                    }
                }
                else if (state == WB_Numeric) {
                    if (prop != WB_MidLetter) {
                        prop = S_skip_extend_format(text, len, iter);
                        // Rules WB11 and WB12.
                        at_break = (prop != state);
                    }
                }
                break;

            case WB_Double_Quote:
                at_break = 1;
                if (state == WB_Hebrew_Letter) {
                    prop = S_skip_extend_format(text, len, iter);
                    // Rules WB7b and WB7c.
                    at_break = (prop != state);
                }
                break;

            default:
                at_break = 1;
                break;
        }

        if (!at_break) {
            // Accept the current character and extend the token past it.
            state = prop;
            S_iter_advance(text, iter);
            word_end = *iter;
        }
    }

    Token *token = Token_new(text + word_start.byte_pos,
                             word_end.byte_pos - word_start.byte_pos,
                             word_start.char_pos, word_end.char_pos,
                             1.0f, 1);
    Inversion_Append(inversion, token);

    return prop;
}