void
Inversion_Invert_IMP(Inversion *self) {
    InversionIVARS *const ivars = Inversion_IVARS(self);

    // Refuse to invert twice; this also thwarts future appends.
    if (ivars->inverted) {
        THROW(ERR, "Inversion has already been inverted");
    }
    ivars->inverted = true;

    // Assign each token its position, advancing by the token's increment.
    Token **cursor      = ivars->tokens;
    Token **const end   = cursor + ivars->size;
    int32_t position    = 0;
    for (; cursor < end; cursor++) {
        TokenIVARS *const tok_ivars = Token_IVARS(*cursor);
        tok_ivars->pos = position;
        // Add in unsigned space to avoid signed-overflow UB, then detect
        // wraparound by comparing against the position just assigned.
        position = (int32_t)((uint32_t)position
                             + (uint32_t)tok_ivars->pos_inc);
        if (position < tok_ivars->pos) {
            THROW(ERR, "Token positions out of order: %i32 %i32",
                  tok_ivars->pos, position);
        }
    }

    // Sort the tokens lexically, and hand off to cluster counting routine.
    qsort(ivars->tokens, ivars->size, sizeof(Token*), Token_compare);
    S_count_clusters(self, ivars);
}
void
MatchPost_Add_Inversion_To_Pool_IMP(MatchPosting *self, PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    // Feed one RawPosting per unique term in the inversion to the
    // PostingPool.  MatchPosting records only doc_id and freq, so boost and
    // length normalization are ignored here.
    MemoryPool *mem_pool = PostPool_Get_Mem_Pool(post_pool);
    // Consistency fix: use Class_Get_Obj_Alloc_Size like the ScorePosting
    // sibling, rather than the legacy VTable_Get_Obj_Alloc_Size spelling.
    const size_t base_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    Token **tokens;
    uint32_t freq;

    UNUSED_VAR(self);
    UNUSED_VAR(type);
    UNUSED_VAR(doc_boost);
    UNUSED_VAR(length_norm);

    Inversion_Reset(inversion);
    // Each cluster is a run of identical terms; `freq` is the run length.
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        TokenIVARS *const token_ivars = Token_IVARS(*tokens);
        uint32_t raw_post_bytes
            = MAX_RAW_POSTING_LEN(base_size, token_ivars->len);
        RawPosting *raw_posting
            = RawPost_new(MemPool_Grab(mem_pool, raw_post_bytes), doc_id, freq,
                          token_ivars->text, token_ivars->len);
        // Consistency fix: pass the posting itself cast to Obj*, matching
        // the ScorePosting sibling, instead of the address of the pointer.
        PostPool_Feed(post_pool, (Obj*)raw_posting);
    }
}
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self, PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    // Serialize each unique term in the inversion as a RawPosting blob laid
    // out as [term text][field boost byte][delta-encoded positions], then
    // feed it to the PostingPool.
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool *mem_pool = PostPool_Get_Mem_Pool(post_pool);
    Similarity *sim = ivars->sim;
    // Combined per-field boost, quantized to a single byte via the
    // Similarity's norm encoding.
    float field_boost = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t field_boost_byte = Sim_Encode_Norm(sim, field_boost);
    const size_t base_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    Token **tokens;
    uint32_t freq;

    Inversion_Reset(inversion);
    // Each cluster is a run of identical terms; `freq` is the run length.
    while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
        TokenIVARS *const token_ivars = Token_IVARS(*tokens);
        // Worst-case allocation: object header + term text + encoded aux
        // data; trimmed down via MemPool_Resize once the real size is known.
        uint32_t raw_post_bytes
            = MAX_RAW_POSTING_LEN(base_size, token_ivars->len, freq);
        RawPosting *raw_posting
            = RawPost_new(MemPool_Grab(mem_pool, raw_post_bytes), doc_id, freq,
                          token_ivars->text, token_ivars->len);
        RawPostingIVARS *const raw_post_ivars = RawPost_IVARS(raw_posting);
        // Aux data begins immediately after the term text within the blob.
        char *const start = raw_post_ivars->blob + token_ivars->len;
        char *dest = start;
        uint32_t last_prox = 0;

        // Field_boost.
        *((uint8_t*)dest) = field_boost_byte;
        dest++;

        // Positions, written as C32-encoded deltas from the previous
        // token's position.
        for (uint32_t i = 0; i < freq; i++) {
            TokenIVARS *const t_ivars = Token_IVARS(tokens[i]);
            const uint32_t prox_delta = t_ivars->pos - last_prox;
            NumUtil_encode_c32(prox_delta, &dest);
            last_prox = t_ivars->pos;
        }

        // Resize raw posting memory allocation down to the bytes actually
        // written, then hand ownership to the pool.
        raw_post_ivars->aux_len = dest - start;
        raw_post_bytes          = dest - (char*)raw_posting;
        MemPool_Resize(mem_pool, raw_posting, raw_post_bytes);
        PostPool_Feed(post_pool, (Obj*)raw_posting);
    }
}
Inversion*
RegexTokenizer_Transform_IMP(RegexTokenizer *self, Inversion *inversion) {
    // Re-tokenize the text of every incoming token into a fresh Inversion.
    Inversion *retokenized = Inversion_new(NULL);
    for (Token *token = Inversion_Next(inversion);
         token != NULL;
         token = Inversion_Next(inversion)
        ) {
        TokenIVARS *const t_ivars = Token_IVARS(token);
        RegexTokenizer_Tokenize_Utf8(self, t_ivars->text, t_ivars->len,
                                     retokenized);
    }
    return retokenized;
}
static void
S_count_clusters(Inversion *self, InversionIVARS *ivars) {
    UNUSED_VAR(self);
    Token **const tokens = ivars->tokens;
    uint32_t *counts
        = (uint32_t*)CALLOCATE(ivars->size + 1, sizeof(uint32_t));

    // Save the cluster counts.
    ivars->cluster_counts_size = ivars->size;
    ivars->cluster_counts      = counts;

    uint32_t i = 0;
    while (i < ivars->size) {
        TokenIVARS *const head_ivars = Token_IVARS(tokens[i]);
        const char *const head_text  = head_ivars->text;
        const size_t head_len        = head_ivars->len;
        uint32_t next = i + 1;

        // Advance `next` past every subsequent token whose text matches
        // the cluster head's.
        for (; next < ivars->size; next++) {
            TokenIVARS *const cand_ivars = Token_IVARS(tokens[next]);
            if (cand_ivars->len != head_len
                || memcmp(cand_ivars->text, head_text, head_len) != 0
               ) {
                break;
            }
        }

        // Record count at the position of the first token in the cluster,
        // then resume scanning at the first unseen token.
        counts[i] = next - i;
        i = next;
    }
}
Inversion*
SnowStop_Transform_IMP(SnowballStopFilter *self, Inversion *inversion) {
    // Copy every token that is not on the stoplist into a new Inversion.
    SnowballStopFilterIVARS *const ivars = SnowStop_IVARS(self);
    Hash *const stoplist = ivars->stoplist;
    Inversion *kept = Inversion_new(NULL);

    Token *token;
    while ((token = Inversion_Next(inversion)) != NULL) {
        TokenIVARS *const t_ivars = Token_IVARS(token);
        if (Hash_Fetch_Utf8(stoplist, t_ivars->text, t_ivars->len) == NULL) {
            // Not a stopword: keep it, taking a fresh refcount.
            Inversion_Append(kept, (Token*)INCREF(token));
        }
    }
    return kept;
}
Inversion*
SnowStemmer_Transform_IMP(SnowballStemmer *self, Inversion *inversion) {
    // Stem each token's text in place, then rewind and return the same
    // Inversion with an added refcount.
    SnowballStemmerIVARS *const ivars = SnowStemmer_IVARS(self);
    struct sb_stemmer *const snowstemmer
        = (struct sb_stemmer*)ivars->snowstemmer;

    Token *token;
    while ((token = Inversion_Next(inversion)) != NULL) {
        TokenIVARS *const t_ivars = Token_IVARS(token);
        const sb_symbol *stemmed_text
            = sb_stemmer_stem(snowstemmer, (sb_symbol*)t_ivars->text,
                              t_ivars->len);
        size_t len = sb_stemmer_length(snowstemmer);
        if (len > t_ivars->len) {
            // Stem is longer than the original text: grow the buffer so it
            // can hold the stem plus its NUL terminator.
            FREEMEM(t_ivars->text);
            t_ivars->text = (char*)MALLOCATE(len + 1);
        }
        memcpy(t_ivars->text, stemmed_text, len + 1);
        t_ivars->len = len;
    }
    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}