// Build one RawPosting per token cluster of `inversion` and feed it to
// `post_pool`.
//
// Each posting is constructed in memory grabbed from the pool's MemoryPool
// and records `doc_id`, the cluster's frequency, and the text of the first
// token in the cluster.  `self`, `type`, `doc_boost` and `length_norm` are
// not used.
void
MatchPost_Add_Inversion_To_Pool_IMP(MatchPosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    MemoryPool *mem_pool  = PostPool_Get_Mem_Pool(post_pool);
    const size_t base_size = VTable_Get_Obj_Alloc_Size(RAWPOSTING);
    Token    **cluster;
    uint32_t   count;
    UNUSED_VAR(self);
    UNUSED_VAR(type);
    UNUSED_VAR(doc_boost);
    UNUSED_VAR(length_norm);

    // Start iterating from the beginning of the inversion.
    Inversion_Reset(inversion);

    for (cluster = Inversion_Next_Cluster(inversion, &count);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &count)
        ) {
        // The first token of the cluster supplies the term text.
        TokenIVARS *const first = Token_IVARS(cluster[0]);
        uint32_t alloc_bytes = MAX_RAW_POSTING_LEN(base_size, first->len);
        void *space = MemPool_Grab(mem_pool, alloc_bytes);
        RawPosting *posting
            = RawPost_new(space, doc_id, count, first->text, first->len);
        PostPool_Feed(post_pool, &posting);
    }
}
/* Build one RawPosting per token cluster of `inversion` and add it to
 * `post_pool`.
 *
 * Each posting is constructed in memory grabbed from post_pool->mem_pool
 * and records `doc_id`, the cluster's frequency, and the text of the first
 * token in the cluster.  `self`, `type`, `doc_boost` and `length_norm` are
 * not used.
 */
void
MatchPost_add_inversion_to_pool(MatchPosting *self, PostingPool *post_pool,
                                Inversion *inversion, FieldType *type,
                                i32_t doc_id, float doc_boost,
                                float length_norm) {
    MemoryPool *mem_pool = post_pool->mem_pool;
    Token **cluster;
    u32_t   count;
    UNUSED_VAR(self);
    UNUSED_VAR(type);
    UNUSED_VAR(doc_boost);
    UNUSED_VAR(length_norm);

    /* Start iterating from the beginning of the inversion. */
    Inversion_Reset(inversion);

    for (cluster = Inversion_Next_Cluster(inversion, &count);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &count)
        ) {
        /* The first token of the cluster supplies the term text. */
        Token *first = cluster[0];
        u32_t alloc_bytes = MAX_RAW_POSTING_LEN(first->len);
        void *space = MemPool_Grab(mem_pool, alloc_bytes);
        RawPosting *posting
            = RawPost_new(space, doc_id, count, first->text, first->len);
        PostPool_Add_Elem(post_pool, (Obj*)posting);
    }
}
// Normalize the text of every token in `inversion` in place.
//
// Each token's text is run through utf8proc_decompose() and then
// utf8proc_reencode() using self->options.  Tokens for which either call
// reports a negative result are left untouched.  The inversion is reset
// before being returned with an extra reference (INCREF).
Inversion*
Normalizer_transform(Normalizer *self, Inversion *inversion) {
    // The scratch space holds one slot more than the capacity handed to
    // utf8proc_decompose, because utf8proc_reencode adds a terminating
    // null char.
    int32_t  stack_buf[INITIAL_BUFSIZE + 1];
    int32_t *work     = stack_buf;
    ssize_t  capacity = INITIAL_BUFSIZE;
    Token   *token;

    for (token = Inversion_Next(inversion);
         token != NULL;
         token = Inversion_Next(inversion)
        ) {
        ssize_t count = utf8proc_decompose((uint8_t*)token->text, token->len,
                                           work, capacity, self->options);

        if (count > capacity) {
            // Scratch space too small: drop any earlier heap buffer and
            // allocate a new one with INITIAL_BUFSIZE items of headroom,
            // then decompose again.
            if (work != stack_buf) {
                FREEMEM(work);
            }
            capacity = count + INITIAL_BUFSIZE;
            work = (int32_t*)MALLOCATE((capacity + 1) * sizeof(int32_t));
            count = utf8proc_decompose((uint8_t*)token->text, token->len,
                                       work, capacity, self->options);
        }

        if (count >= 0) {
            ssize_t nbytes = utf8proc_reencode(work, count, self->options);

            if (nbytes >= 0) {
                // Only reallocate the token's text when the result is
                // longer than what the token currently holds.
                if (nbytes > (ssize_t)token->len) {
                    FREEMEM(token->text);
                    token->text = (char*)MALLOCATE(nbytes + 1);
                }
                // Copy the re-encoded bytes plus the terminating null.
                memcpy(token->text, work, nbytes + 1);
                token->len = nbytes;
            }
        }
    }

    if (work != stack_buf) {
        FREEMEM(work);
    }

    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}
// Build one RawPosting per token cluster of `inversion` and feed it to
// `post_pool`.
//
// After the term text, each posting's blob receives one encoded field-boost
// byte (doc_boost * field type boost * length_norm, encoded through the
// posting's Similarity) followed by the c32-encoded position deltas of
// every token in the cluster.  The allocation is grabbed at a maximum size
// and then resized down to the bytes actually written.
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool *mem_pool = PostPool_Get_Mem_Pool(post_pool);
    Similarity *sim      = ivars->sim;
    float field_boost    = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t boost_byte = Sim_Encode_Norm(sim, field_boost);
    const size_t  base_size  = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    Token    **cluster;
    uint32_t   count;

    Inversion_Reset(inversion);

    for (cluster = Inversion_Next_Cluster(inversion, &count);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &count)
        ) {
        TokenIVARS *const first = Token_IVARS(cluster[0]);
        uint32_t alloc_bytes
            = MAX_RAW_POSTING_LEN(base_size, first->len, count);
        void *space = MemPool_Grab(mem_pool, alloc_bytes);
        RawPosting *posting
            = RawPost_new(space, doc_id, count, first->text, first->len);
        RawPostingIVARS *const posting_ivars = RawPost_IVARS(posting);

        // Auxiliary data starts right after the term text in the blob.
        char *const aux_start = posting_ivars->blob + first->len;
        char *dest = aux_start;

        // Field boost: a single byte.
        uint8_t *boost_slot = (uint8_t*)dest;
        *boost_slot = boost_byte;
        dest += 1;

        // Positions, each stored as a delta from the previous one.
        uint32_t prev_pos = 0;
        Token **const limit = cluster + count;
        for (Token **cursor = cluster; cursor < limit; cursor++) {
            TokenIVARS *const cur = Token_IVARS(*cursor);
            const uint32_t delta = cur->pos - prev_pos;
            NumUtil_encode_c32(delta, &dest);
            prev_pos = cur->pos;
        }

        // Shrink the allocation to the bytes actually used.
        posting_ivars->aux_len = dest - aux_start;
        alloc_bytes = dest - (char*)posting;
        MemPool_Resize(mem_pool, posting, alloc_bytes);

        PostPool_Feed(post_pool, (Obj*)posting);
    }
}
// Build one RawPosting per token cluster of `inversion` and feed it to
// `post_pool`.
//
// After the term text, each posting's blob receives, for every token in
// the cluster, the c32-encoded position delta followed by one byte holding
// the encoded norm of (doc_boost * field type boost * length_norm *
// token boost).  The allocation is grabbed at a maximum size and then
// resized down to the bytes actually written.
void
RichPost_add_inversion_to_pool(RichPosting *self, PostingPool *post_pool,
                               Inversion *inversion, FieldType *type,
                               int32_t doc_id, float doc_boost,
                               float length_norm) {
    MemoryPool *mem_pool = PostPool_Get_Mem_Pool(post_pool);
    Similarity *sim      = self->sim;
    float field_boost    = doc_boost * FType_Get_Boost(type) * length_norm;
    Token    **cluster;
    uint32_t   count;

    Inversion_Reset(inversion);

    for (cluster = Inversion_Next_Cluster(inversion, &count);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &count)
        ) {
        Token *first = cluster[0];
        uint32_t alloc_bytes = MAX_RAW_POSTING_LEN(first->len, count);
        void *space = MemPool_Grab(mem_pool, alloc_bytes);
        RawPosting *posting
            = RawPost_new(space, doc_id, count, first->text, first->len);

        // Auxiliary data starts right after the term text in the blob.
        char *const aux_start = posting->blob + first->len;
        char *dest = aux_start;
        uint32_t prev_pos = 0;
        Token **cursor = cluster;
        Token **const limit = cluster + count;

        // Positions and boosts, interleaved per token.
        for (; cursor < limit; cursor++) {
            Token *const cur = *cursor;
            const uint32_t delta = cur->pos - prev_pos;
            const float token_boost = field_boost * cur->boost;
            uint8_t *norm_slot;

            NumUtil_encode_c32(delta, &dest);
            prev_pos = cur->pos;

            norm_slot  = (uint8_t*)dest;
            *norm_slot = Sim_Encode_Norm(sim, token_boost);
            dest += 1;
        }

        // Shrink the allocation to the bytes actually used.
        posting->aux_len = dest - aux_start;
        alloc_bytes = dest - (char*)posting;
        MemPool_Resize(mem_pool, posting, alloc_bytes);

        PostPool_Feed(post_pool, &posting);
    }
}
// Replace the text of every token in `inversion` with its Snowball stem.
//
// Each token's text is passed to sb_stemmer_stem(); the stemmed bytes are
// then copied back into the token, reallocating the token's text only when
// the stem is longer than the current text.  A token for which the stemmer
// returns NULL is skipped and keeps its original text.  The inversion is
// reset before being returned with an extra reference (INCREF).
Inversion*
SnowStemmer_Transform_IMP(SnowballStemmer *self, Inversion *inversion) {
    Token *token;
    SnowballStemmerIVARS *const ivars = SnowStemmer_IVARS(self);
    struct sb_stemmer *const snowstemmer
        = (struct sb_stemmer*)ivars->snowstemmer;

    while (NULL != (token = Inversion_Next(inversion))) {
        TokenIVARS *const token_ivars = Token_IVARS(token);
        const sb_symbol *stemmed_text
            = sb_stemmer_stem(snowstemmer, (sb_symbol*)token_ivars->text,
                              token_ivars->len);

        // libstemmer documents a NULL return from sb_stemmer_stem() when
        // it runs out of memory.  Passing NULL to memcpy() is undefined
        // behaviour, so leave this token unstemmed instead.
        if (stemmed_text == NULL) {
            continue;
        }

        size_t len = sb_stemmer_length(snowstemmer);
        if (len > token_ivars->len) {
            // Stem is longer than the existing text: get a fresh buffer.
            FREEMEM(token_ivars->text);
            token_ivars->text = (char*)MALLOCATE(len + 1);
        }
        // Copies len + 1 bytes -- presumably the stem plus a trailing NUL
        // supplied by libstemmer; TODO confirm against libstemmer.
        memcpy(token_ivars->text, stemmed_text, len + 1);
        token_ivars->len = len;
    }

    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}
// Serialize the token clusters of `inversion` into a newly created ByteBuf.
//
// Layout written here: a padded c32 posting count, then for each cluster
// the overlap with the previous cluster's text (c32), the length of the
// differing tail (c32), the tail bytes, the number of positions (c32), and
// for every token a (position, start offset, end offset) triple of c32s.
// `self` is not used.
ByteBuf*
HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion *inversion) {
    const char *prev_text = "";
    size_t      prev_len  = 0;
    ByteBuf    *tv_buf    = BB_new(20 + Inversion_Get_Size(inversion) * 8);
    uint32_t    posting_count = 0;
    Token     **cluster;
    uint32_t    freq;
    UNUSED_VAR(self);

    // Leave space for a c32 indicating the number of postings.
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    for (cluster = Inversion_Next_Cluster(inversion, &freq);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &freq)
        ) {
        Token *token = *cluster;
        char *const text = Token_Get_Text(token);
        const int32_t text_len = Token_Get_Len(token);
        int32_t overlap
            = StrHelp_overlap(prev_text, text, prev_len, text_len);
        const int32_t diff_len = text_len - overlap;

        // Worst-case size of the buffer after appending this posting.
        size_t used = BB_Get_Size(tv_buf);
        size_t needed = used
                        + C32_MAX_BYTES               // overlap
                        + C32_MAX_BYTES               // length of string diff
                        + diff_len                    // diff char data
                        + C32_MAX_BYTES               // num prox
                        + (C32_MAX_BYTES * freq * 3); // pos data

        // Allocate for the worst case, then write at the current end.
        char *const base = BB_Grow(tv_buf, needed);
        char *ptr = base + used;

        // Track number of postings.
        posting_count++;

        // Append the string diff to the tv_buf.
        NumUtil_encode_c32(overlap, &ptr);
        NumUtil_encode_c32(diff_len, &ptr);
        memcpy(ptr, text + overlap, diff_len);
        ptr += diff_len;

        // Remember this text for comparison with the next cluster.
        prev_text = text;
        prev_len  = text_len;

        // Append the number of positions for this term.
        NumUtil_encode_c32(freq, &ptr);

        // Append position, start_offset and end_offset for each token.
        // The loop always handles the first token, then advances while
        // the decremented count is non-zero and the next token is
        // non-NULL.
        uint32_t remaining = freq;
        Token  **cursor    = cluster;
        for (;;) {
            NumUtil_encode_c32(Token_Get_Pos(token), &ptr);
            NumUtil_encode_c32(Token_Get_Start_Offset(token), &ptr);
            NumUtil_encode_c32(Token_Get_End_Offset(token), &ptr);
            if (--remaining == 0) {
                break;
            }
            token = *++cursor;
            if (token == NULL) {
                break;
            }
        }

        // Set new byte length.
        BB_Set_Size(tv_buf, ptr - base);
    }

    // Go back and start the term vector string with the posting count.
    char *dest = BB_Get_Buf(tv_buf);
    NumUtil_encode_padded_c32(posting_count, &dest);

    return tv_buf;
}
/* Serialize the token clusters of `inversion` into a newly created ByteBuf.
 *
 * Layout written here: a padded c32 posting count, then for each cluster
 * the overlap with the previous cluster's text (c32), the length of the
 * differing tail (c32), the tail bytes, the number of positions (c32), and
 * for every token a (position, start offset, end offset) triple of c32s.
 * `self` is not used.
 */
ByteBuf*
HLWriter_tv_buf(HighlightWriter *self, Inversion *inversion) {
    char    *prev_text = "";
    size_t   prev_len  = 0;
    ByteBuf *tv_buf    = BB_new(20 + inversion->size * 8);
    u32_t    posting_count = 0;
    char    *dest;
    Token  **cluster;
    u32_t    freq;
    UNUSED_VAR(self);

    /* Leave space for a c32 indicating the number of postings. */
    BB_Set_Size(tv_buf, C32_MAX_BYTES);

    Inversion_Reset(inversion);
    for (cluster = Inversion_Next_Cluster(inversion, &freq);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &freq)
        ) {
        Token  *token   = *cluster;
        i32_t   overlap = StrHelp_string_diff(prev_text, token->text,
                                              prev_len, token->len);
        char   *ptr;
        Token **cursor;
        u32_t   remaining;
        /* Worst-case size of the buffer after appending this posting. */
        size_t  needed = BB_Get_Size(tv_buf)
                         + C32_MAX_BYTES               /* overlap */
                         + C32_MAX_BYTES               /* length of string diff */
                         + (token->len - overlap)      /* diff char data */
                         + C32_MAX_BYTES               /* num prox */
                         + (C32_MAX_BYTES * freq * 3); /* pos data */

        /* Allocate for the worst case, then write at the current end. */
        BB_Grow(tv_buf, needed);
        ptr = BBEND(tv_buf);

        /* Track number of postings. */
        posting_count++;

        /* Append the string diff to the tv_buf. */
        Math_encode_c32(overlap, &ptr);
        Math_encode_c32((token->len - overlap), &ptr);
        memcpy(ptr, (token->text + overlap), (token->len - overlap));
        ptr += token->len - overlap;

        /* Remember this text for comparison with the next cluster. */
        prev_text = token->text;
        prev_len  = token->len;

        /* Append the number of positions for this term. */
        Math_encode_c32(freq, &ptr);

        /* Append position, start_offset and end_offset for each token.
         * The loop always handles the first token, then advances while
         * the decremented count is non-zero and the next token is
         * non-NULL.
         */
        remaining = freq;
        cursor    = cluster;
        for (;;) {
            Math_encode_c32(token->pos, &ptr);
            Math_encode_c32(token->start_offset, &ptr);
            Math_encode_c32(token->end_offset, &ptr);
            if (--remaining == 0) {
                break;
            }
            token = *++cursor;
            if (token == NULL) {
                break;
            }
        }

        /* Set new byte length. */
        BB_Set_Size(tv_buf, ptr - tv_buf->ptr);
    }

    /* Go back and start the term vector string with the number of
     * postings.
     */
    dest = tv_buf->ptr;
    Math_encode_padded_c32(posting_count, &dest);

    return tv_buf;
}