Exemple #1
0
// Finalize an Inversion: stamp every token with a position derived from the
// running sum of position increments, then sort the token array with
// Token_compare and hand off to S_count_clusters().
//
// Throws if the Inversion was already inverted, or if adding a token's
// position increment produces a value lower than that token's position.
void
Inversion_Invert_IMP(Inversion *self) {
    InversionIVARS *const ivars = Inversion_IVARS(self);
    Token   **cursor    = ivars->tokens;
    Token   **const end = cursor + ivars->size;
    int32_t   next_pos  = 0;

    // Inverting twice is an error.  The flag is raised before any other
    // work is done (the original note says this is meant to thwart later
    // attempts to append).
    if (ivars->inverted) {
        THROW(ERR, "Inversion has already been inverted");
    }
    ivars->inverted = true;

    // Walk the tokens in their current order, assigning positions.
    while (cursor < end) {
        TokenIVARS *const tok = Token_IVARS(*cursor);
        tok->pos = next_pos;

        // Do the addition on unsigned operands, then compare against the
        // position just assigned to detect a result that went backwards.
        next_pos = (int32_t)((uint32_t)next_pos + (uint32_t)tok->pos_inc);
        if (next_pos < tok->pos) {
            THROW(ERR, "Token positions out of order: %i32 %i32",
                  tok->pos, next_pos);
        }
        cursor++;
    }

    // Sort the token pointers, then count runs of matching tokens.
    qsort(ivars->tokens, ivars->size, sizeof(Token*), Token_compare);
    S_count_clusters(self, ivars);
}
Exemple #2
0
// For each token cluster yielded by `inversion`, build a RawPosting in
// memory grabbed from the pool's MemoryPool and feed it to `post_pool`.
//
// Only `post_pool`, `inversion` and `doc_id` are used; `self`, `type`,
// `doc_boost` and `length_norm` are accepted but ignored.
void
MatchPost_Add_Inversion_To_Pool_IMP(MatchPosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    MemoryPool  *pool = PostPool_Get_Mem_Pool(post_pool);
    const size_t obj_size = VTable_Get_Obj_Alloc_Size(RAWPOSTING);

    UNUSED_VAR(self);
    UNUSED_VAR(type);
    UNUSED_VAR(doc_boost);
    UNUSED_VAR(length_norm);

    Inversion_Reset(inversion);
    for (;;) {
        uint32_t freq;
        Token  **cluster = Inversion_Next_Cluster(inversion, &freq);
        if (cluster == NULL) {
            break;
        }

        // The first token of the cluster supplies the term text.
        TokenIVARS *const first = Token_IVARS(*cluster);
        uint32_t alloc_bytes = MAX_RAW_POSTING_LEN(obj_size, first->len);
        RawPosting *raw_posting
            = RawPost_new(MemPool_Grab(pool, alloc_bytes), doc_id, freq,
                          first->text, first->len);
        PostPool_Feed(post_pool, &raw_posting);
    }
}
Exemple #3
0
// For each token cluster yielded by `inversion`, build a RawPosting whose
// auxiliary data (written after the term text in the blob) is one encoded
// field-boost byte followed by the cluster's position deltas, then feed the
// posting to `post_pool`.
//
// The field boost is doc_boost * FType_Get_Boost(type) * length_norm, run
// through Sim_Encode_Norm() on this posting's Similarity.
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool    *pool = PostPool_Get_Mem_Pool(post_pool);
    float          field_boost = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t  boost_byte = Sim_Encode_Norm(ivars->sim, field_boost);
    const size_t   obj_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);

    Inversion_Reset(inversion);
    for (;;) {
        uint32_t freq;
        Token  **cluster = Inversion_Next_Cluster(inversion, &freq);
        if (cluster == NULL) {
            break;
        }

        // Grab a region sized by MAX_RAW_POSTING_LEN; it is resized below
        // once the number of bytes actually written is known.
        TokenIVARS *const first = Token_IVARS(*cluster);
        uint32_t alloc_bytes
            = MAX_RAW_POSTING_LEN(obj_size, first->len, freq);
        RawPosting *raw_posting
            = RawPost_new(MemPool_Grab(pool, alloc_bytes), doc_id, freq,
                          first->text, first->len);
        RawPostingIVARS *const raw_ivars = RawPost_IVARS(raw_posting);

        // Auxiliary data begins right after the term text.
        char *const aux_start = raw_ivars->blob + first->len;
        char *out = aux_start;

        // Field boost byte.
        *((uint8_t*)out) = boost_byte;
        out++;

        // Positions, each written as the difference from the previous one.
        uint32_t prev_pos = 0;
        for (uint32_t tick = 0; tick < freq; tick++) {
            TokenIVARS *const tok = Token_IVARS(cluster[tick]);
            const uint32_t delta = tok->pos - prev_pos;
            NumUtil_encode_c32(delta, &out);
            prev_pos = tok->pos;
        }

        // Record the aux length and resize the allocation to what was used.
        raw_ivars->aux_len = out - aux_start;
        alloc_bytes = out - (char*)raw_posting;
        MemPool_Resize(pool, raw_posting, alloc_bytes);
        PostPool_Feed(post_pool, (Obj*)raw_posting);
    }
}
Exemple #4
0
// Build a new Inversion by running RegexTokenizer_Tokenize_Utf8() over the
// text of every token pulled from `inversion`.  Returns the new Inversion.
Inversion*
RegexTokenizer_Transform_IMP(RegexTokenizer *self, Inversion *inversion) {
    Inversion *result = Inversion_new(NULL);

    for (Token *token = Inversion_Next(inversion);
         token != NULL;
         token = Inversion_Next(inversion)
        ) {
        TokenIVARS *const tok = Token_IVARS(token);
        RegexTokenizer_Tokenize_Utf8(self, tok->text, tok->len, result);
    }

    return result;
}
Exemple #5
0
// Scan the token array for runs of consecutive tokens with byte-identical
// text, and store each run's length in a freshly allocated counts array at
// the index of the run's first token.  The array (size + 1 entries from
// CALLOCATE) and its recorded size are saved on `ivars`.
//
// NOTE(review): entries that are not the start of a run are never written
// here; they keep whatever CALLOCATE initialised them to (presumably zero).
static void
S_count_clusters(Inversion *self, InversionIVARS *ivars) {
    UNUSED_VAR(self);
    Token **tokens = ivars->tokens;
    uint32_t *counts
        = (uint32_t*)CALLOCATE(ivars->size + 1, sizeof(uint32_t));

    // Publish the counts array before filling it in.
    ivars->cluster_counts_size = ivars->size;
    ivars->cluster_counts = counts;

    uint32_t run_start = 0;
    while (run_start < ivars->size) {
        TokenIVARS *const head = Token_IVARS(tokens[run_start]);
        char  *const head_text = head->text;
        const size_t head_len  = head->len;
        uint32_t     run_end;

        // Advance until a token's text differs from the head of the run.
        for (run_end = run_start + 1; run_end < ivars->size; run_end++) {
            TokenIVARS *const other = Token_IVARS(tokens[run_end]);
            if ((other->len != head_len)
                    || (memcmp(other->text, head_text, head_len) != 0)
               ) {
                break;
            }
        }

        // The run length lives at the index of the run's first token.
        counts[run_start] = run_end - run_start;

        // Resume at the first token that did not match.
        run_start = run_end;
    }
}
// Build a new Inversion containing only those tokens of `inversion` whose
// text has no entry in this filter's stoplist.  Each kept token is
// INCREF'd before being appended.  Returns the new Inversion.
Inversion*
SnowStop_Transform_IMP(SnowballStopFilter *self, Inversion *inversion) {
    Inversion *kept = Inversion_new(NULL);
    SnowballStopFilterIVARS *const ivars = SnowStop_IVARS(self);
    Hash *const stoplist = ivars->stoplist;

    for (;;) {
        Token *token = Inversion_Next(inversion);
        if (token == NULL) {
            break;
        }

        TokenIVARS *const tok = Token_IVARS(token);
        if (Hash_Fetch_Utf8(stoplist, tok->text, tok->len)) {
            // Text is in the stoplist: drop the token.
            continue;
        }
        Inversion_Append(kept, (Token*)INCREF(token));
    }

    return kept;
}
// Stem every token of `inversion` in place with this object's Snowball
// stemmer, overwriting each token's text and length with the stemmed form.
// The Inversion is then reset, and the same Inversion is returned after
// being INCREF'd.
//
// NOTE(review): libstemmer documents that sb_stemmer_stem() returns NULL on
// an out-of-memory error; that result is not checked before the memcpy
// below -- confirm whether it needs handling.
Inversion*
SnowStemmer_Transform_IMP(SnowballStemmer *self, Inversion *inversion) {
    SnowballStemmerIVARS *const ivars = SnowStemmer_IVARS(self);
    struct sb_stemmer *const stemmer
        = (struct sb_stemmer*)ivars->snowstemmer;

    for (Token *token = Inversion_Next(inversion);
         token != NULL;
         token = Inversion_Next(inversion)
        ) {
        TokenIVARS *const tok = Token_IVARS(token);
        const sb_symbol *stem
            = sb_stemmer_stem(stemmer, (sb_symbol*)tok->text, tok->len);
        size_t stem_len = sb_stemmer_length(stemmer);

        // Replace the buffer only when the stem is longer than the
        // current text; otherwise the existing buffer is reused.
        if (stem_len > tok->len) {
            FREEMEM(tok->text);
            tok->text = (char*)MALLOCATE(stem_len + 1);
        }

        // Copy one byte beyond the stem length (presumably the stemmer's
        // terminating NUL).
        memcpy(tok->text, stem, stem_len + 1);
        tok->len = stem_len;
    }

    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}