Пример #1
0
/*
 * Parse a word starting with an ALetter, Numeric or Katakana character.
 * Advances the iterator and returns the word break property of the current
 * character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    int wb = -1;
    lucy_StringIter start = *iter;
    S_iter_advance(text, iter);
    lucy_StringIter end = *iter;

    while (iter->byte_pos < len) {
        wb = S_wb_lookup(text + iter->byte_pos);

        switch (wb) {
            case WB_ALetter:
            case WB_Numeric:
                if (state == WB_Katakana) { goto word_break; }
                break;
            case WB_Katakana:
                if (state == WB_ALetter || state == WB_Numeric) {
                    goto word_break;
                }
                break;
            case WB_ExtendNumLet:
                break;
            case WB_Extend_Format:
                // keep state
                wb = state;
                break;
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                if ((state == WB_ALetter && wb != WB_MidNum)
                    ||  (state == WB_Numeric && wb != WB_MidLetter)) {
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == state) { break; }
                }
                goto word_break;
            default:
                goto word_break;
        }

        state = wb;
        S_iter_advance(text, iter);
        end = *iter;
    }

    Token *token;
word_break:
    token = Token_new(text + start.byte_pos, end.byte_pos - start.byte_pos,
                      start.char_pos, end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, token);

    return wb;
}
Пример #2
0
/*
 * Parse a word consisting of a single codepoint followed by extend or
 * format characters. Used for Alphabetic characters that don't have the
 * ALetter word break property: ideographs, Hiragana, and "complex content".
 * Advances the iterator and returns the word break property of the current
 * character.
 */
static int
S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
               Inversion *inversion) {
    lucy_StringIter start = *iter;
    int wb = S_skip_extend_format(text, len, iter);

    Token *token = Token_new(text + start.byte_pos,
                             iter->byte_pos - start.byte_pos,
                             start.char_pos, iter->char_pos, 1.0f, 1);
    Inversion_Append(inversion, token);

    return wb;
}
void
WhitespaceTokenizer_Tokenize_Str_IMP(WhitespaceTokenizer *self,
                                     const char *text, size_t len,
                                     Inversion *inversion) {
    size_t byte_pos       = 0;
    size_t char_pos       = 0;
    size_t start_byte_pos = 0;
    size_t start_char_pos = 0;
    int    prev_ws        = 1;

    while (byte_pos < len) {
        uint32_t cp = StrHelp_decode_utf8_char(text + byte_pos);
        int      ws = isspace(cp);

        if (prev_ws && !ws) {
            start_byte_pos = byte_pos;
            start_char_pos = char_pos;
        }
        else if (!prev_ws && ws) {
            Token *token = Token_new(text + start_byte_pos,
                                     byte_pos - start_byte_pos,
                                     start_char_pos, char_pos, 1.0f, 1);
            Inversion_Append(inversion, token);
        }

        prev_ws = ws;
        byte_pos += StrHelp_UTF8_COUNT[(uint8_t)text[byte_pos]];
        char_pos += 1;
    }

    if (!prev_ws) {
        Token *token = Token_new(text + start_byte_pos,
                                 byte_pos - start_byte_pos,
                                 start_char_pos, char_pos, 1.0f, 1);
        Inversion_Append(inversion, token);
    }
}
Пример #4
0
Inversion*
Stopalizer_transform(Stopalizer *self, Inversion *inversion)
{
    Token *token;
    Inversion *new_inversion = Inversion_new(NULL);
    Hash *const stoplist  = self->stoplist;

    while (NULL != (token = Inversion_Next(inversion))) {
        if (!Hash_Fetch_Str(stoplist, token->text, token->len)) {
            Inversion_Append(new_inversion, (Token*)INCREF(token));
        }
    }

    return new_inversion;
}
Пример #5
0
Inversion*
SnowStop_Transform_IMP(SnowballStopFilter *self, Inversion *inversion) {
    Token *token;
    Inversion *new_inversion = Inversion_new(NULL);
    SnowballStopFilterIVARS *const ivars = SnowStop_IVARS(self);
    Hash *const stoplist  = ivars->stoplist;

    while (NULL != (token = Inversion_Next(inversion))) {
        TokenIVARS *const token_ivars = Token_IVARS(token);
        if (!Hash_Fetch_Utf8(stoplist, token_ivars->text, token_ivars->len)) {
            Inversion_Append(new_inversion, (Token*)INCREF(token));
        }
    }

    return new_inversion;
}
Пример #6
0
Inversion*
Inversion_new(Token *seed_token) {
    Inversion *self = (Inversion*)Class_Make_Obj(INVERSION);
    InversionIVARS *const ivars = Inversion_IVARS(self);

    // Init.
    ivars->cap                 = 16;
    ivars->size                = 0;
    ivars->tokens              = (Token**)CALLOCATE(ivars->cap, sizeof(Token*));
    ivars->cur                 = 0;
    ivars->inverted            = false;
    ivars->cluster_counts      = NULL;
    ivars->cluster_counts_size = 0;

    // Process the seed token.
    if (seed_token != NULL) {
        Inversion_Append(self, (Token*)INCREF(seed_token));
    }

    return self;
}
Пример #7
0
/*
 * Parse a word starting with an ALetter, Numeric, Katakana, or ExtendNumLet
 * character. Advances the iterator and returns the word break property of the
 * current character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    int wb = -1;
    lucy_StringIter start = *iter;
    S_iter_advance(text, iter);
    lucy_StringIter end = *iter;

    while (iter->byte_pos < len) {
        wb = S_wb_lookup(text + iter->byte_pos);

        switch (wb) {
            case WB_ALetter:
            case WB_Hebrew_Letter:
            case WB_Numeric:
                if (state == WB_Katakana) { goto word_break; }
                // Rules WB5, WB8, WB9, WB10, and WB13b.
                break;
            case WB_Katakana:
                if (state != WB_Katakana && state != WB_ExtendNumLet) {
                    goto word_break;
                }
                // Rules WB13 and WB13b.
                break;
            case WB_ExtendNumLet:
                // Rule WB13a.
                break;
            case WB_Extend_Format:
                // Rule WB4. Keep state.
                wb = state;
                break;
            case WB_Single_Quote:
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                if (state == WB_ALetter) {
                    if (wb == WB_MidNum) { goto word_break; }
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == WB_ALetter || wb == WB_Hebrew_Letter) {
                        // Rules WB6 and WB7.
                        state = wb;
                        break;
                    }
                }
                else if (state == WB_Hebrew_Letter) {
                    if (wb == WB_MidNum) { goto word_break; }
                    if (wb == WB_Single_Quote) {
                        // Rule WB7a.
                        ++end.byte_pos;
                        ++end.char_pos;
                    }
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == WB_ALetter || wb == WB_Hebrew_Letter) {
                        // Rules WB6 and WB7.
                        state = wb;
                        break;
                    }
                }
                else if (state == WB_Numeric) {
                    if (wb == WB_MidLetter) { goto word_break; }
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == state) {
                        // Rules WB11 and WB12.
                        break;
                    }
                }
                goto word_break;
            case WB_Double_Quote:
                if (state == WB_Hebrew_Letter) {
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == state) {
                        // Rules WB7b and WB7c.
                        break;
                    }
                }
                goto word_break;
            default:
                goto word_break;
        }

        state = wb;
        S_iter_advance(text, iter);
        end = *iter;
    }

    Token *token;
word_break:
    token = Token_new(text + start.byte_pos, end.byte_pos - start.byte_pos,
                      start.char_pos, end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, token);

    return wb;
}