Beispiel #1
0
Inversion*
StandardTokenizer_transform(StandardTokenizer *self, Inversion *inversion) {
    Inversion *new_inversion = Inversion_new(NULL);
    Token *token;

    while (NULL != (token = Inversion_Next(inversion))) {
        StandardTokenizer_Tokenize_Str(self, token->text, token->len,
                                       new_inversion);
    }

    return new_inversion;
}
Inversion*
WhitespaceTokenizer_Transform_IMP(WhitespaceTokenizer *self,
                                  Inversion *inversion) {
    Inversion *new_inversion = Inversion_new(NULL);
    Token *token;

    while (NULL != (token = Inversion_Next(inversion))) {
        WhitespaceTokenizer_Tokenize_Str(self, Token_Get_Text(token),
                                         Token_Get_Len(token), new_inversion);
    }

    return new_inversion;
}
Beispiel #3
0
Inversion*
RegexTokenizer_Transform_IMP(RegexTokenizer *self, Inversion *inversion) {
    Inversion *new_inversion = Inversion_new(NULL);
    Token *token;

    while (NULL != (token = Inversion_Next(inversion))) {
        TokenIVARS *const token_ivars = Token_IVARS(token);
        RegexTokenizer_Tokenize_Utf8(self, token_ivars->text, token_ivars->len,
                                     new_inversion);
    }

    return new_inversion;
}
Beispiel #4
0
Inversion*
Stopalizer_transform(Stopalizer *self, Inversion *inversion)
{
    Token *token;
    Inversion *new_inversion = Inversion_new(NULL);
    Hash *const stoplist  = self->stoplist;

    while (NULL != (token = Inversion_Next(inversion))) {
        if (!Hash_Fetch_Str(stoplist, token->text, token->len)) {
            Inversion_Append(new_inversion, (Token*)INCREF(token));
        }
    }

    return new_inversion;
}
Beispiel #5
0
Inversion*
SnowStop_Transform_IMP(SnowballStopFilter *self, Inversion *inversion) {
    Token *token;
    Inversion *new_inversion = Inversion_new(NULL);
    SnowballStopFilterIVARS *const ivars = SnowStop_IVARS(self);
    Hash *const stoplist  = ivars->stoplist;

    while (NULL != (token = Inversion_Next(inversion))) {
        TokenIVARS *const token_ivars = Token_IVARS(token);
        if (!Hash_Fetch_Utf8(stoplist, token_ivars->text, token_ivars->len)) {
            Inversion_Append(new_inversion, (Token*)INCREF(token));
        }
    }

    return new_inversion;
}
Beispiel #6
0
Inversion*
Normalizer_transform(Normalizer *self, Inversion *inversion) {
    // allocate additional space because utf8proc_reencode adds a
    // terminating null char
    int32_t static_buffer[INITIAL_BUFSIZE + 1];
    int32_t *buffer = static_buffer;
    ssize_t bufsize = INITIAL_BUFSIZE;
    Token *token;

    while (NULL != (token = Inversion_Next(inversion))) {
        ssize_t len = utf8proc_decompose((uint8_t*)token->text, token->len,
                                         buffer, bufsize, self->options);

        if (len > bufsize) {
            // buffer too small, (re)allocate
            if (buffer != static_buffer) {
                FREEMEM(buffer);
            }
            // allocate additional INITIAL_BUFSIZE items
            bufsize = len + INITIAL_BUFSIZE;
            buffer = (int32_t*)MALLOCATE((bufsize + 1) * sizeof(int32_t));
            len = utf8proc_decompose((uint8_t*)token->text, token->len,
                                     buffer, bufsize, self->options);
        }
        if (len < 0) {
            continue;
        }

        len = utf8proc_reencode(buffer, len, self->options);

        if (len >= 0) {
            if (len > (ssize_t)token->len) {
                FREEMEM(token->text);
                token->text = (char*)MALLOCATE(len + 1);
            }
            memcpy(token->text, buffer, len + 1);
            token->len = len;
        }
    }

    if (buffer != static_buffer) {
        FREEMEM(buffer);
    }

    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}
Inversion*
SnowStemmer_Transform_IMP(SnowballStemmer *self, Inversion *inversion) {
    Token *token;
    SnowballStemmerIVARS *const ivars = SnowStemmer_IVARS(self);
    struct sb_stemmer *const snowstemmer
        = (struct sb_stemmer*)ivars->snowstemmer;

    while (NULL != (token = Inversion_Next(inversion))) {
        TokenIVARS *const token_ivars = Token_IVARS(token);
        const sb_symbol *stemmed_text
            = sb_stemmer_stem(snowstemmer, (sb_symbol*)token_ivars->text,
                              token_ivars->len);
        size_t len = sb_stemmer_length(snowstemmer);
        if (len > token_ivars->len) {
            FREEMEM(token_ivars->text);
            token_ivars->text = (char*)MALLOCATE(len + 1);
        }
        memcpy(token_ivars->text, stemmed_text, len + 1);
        token_ivars->len = len;
    }
    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}