// Feed one RawPosting into `post_pool` for every token cluster that
// `inversion` yields.
//
// Each posting is constructed in memory grabbed from the pool's MemoryPool
// and records `doc_id`, the cluster's frequency, and the text of the first
// token in the cluster.  `self`, `type`, `doc_boost` and `length_norm` are
// not used.
void
MatchPost_Add_Inversion_To_Pool_IMP(MatchPosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    MemoryPool *const arena    = PostPool_Get_Mem_Pool(post_pool);
    const size_t      obj_size = VTable_Get_Obj_Alloc_Size(RAWPOSTING);
    Token           **cluster;
    uint32_t          cluster_freq;

    UNUSED_VAR(self);
    UNUSED_VAR(type);
    UNUSED_VAR(doc_boost);
    UNUSED_VAR(length_norm);

    Inversion_Reset(inversion);
    for (;;) {
        cluster = Inversion_Next_Cluster(inversion, &cluster_freq);
        if (cluster == NULL) {
            break;
        }

        // The first token of the cluster supplies the term text.
        TokenIVARS *const head = Token_IVARS(cluster[0]);
        uint32_t alloc_len = MAX_RAW_POSTING_LEN(obj_size, head->len);
        void *slot = MemPool_Grab(arena, alloc_len);
        RawPosting *entry = RawPost_new(slot, doc_id, cluster_freq,
                                        head->text, head->len);

        PostPool_Feed(post_pool, &entry);
    }
}
// Add one RawPosting element to `post_pool` for every token cluster that
// `inversion` yields.
//
// Postings are built in memory grabbed from `post_pool->mem_pool` and hold
// `doc_id`, the cluster's frequency, and the text of the cluster's first
// token.  `self`, `type`, `doc_boost` and `length_norm` are not used.
void
MatchPost_add_inversion_to_pool(MatchPosting *self, PostingPool *post_pool,
                                Inversion *inversion, FieldType *type,
                                i32_t doc_id, float doc_boost,
                                float length_norm) {
    MemoryPool *const arena = post_pool->mem_pool;
    Token           **cluster;
    u32_t             cluster_freq;

    UNUSED_VAR(self);
    UNUSED_VAR(type);
    UNUSED_VAR(doc_boost);
    UNUSED_VAR(length_norm);

    Inversion_Reset(inversion);
    for (cluster = Inversion_Next_Cluster(inversion, &cluster_freq);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &cluster_freq)
        ) {
        // The first token of the cluster supplies the term text.
        Token *const head      = cluster[0];
        u32_t        alloc_len = MAX_RAW_POSTING_LEN(head->len);
        void        *slot      = MemPool_Grab(arena, alloc_len);
        RawPosting  *entry     = RawPost_new(slot, doc_id, cluster_freq,
                                             head->text, head->len);

        PostPool_Add_Elem(post_pool, (Obj*)entry);
    }
}
// Read one posting from `instream` and return it as a RawPosting allocated
// from `mem_pool`.
//
// The stream supplies a doc code whose upper bits are the delta from
// `last_doc_id`; when the low bit is set the frequency is 1, otherwise the
// frequency is read next.  For each of the `freq` positions, the raw C64
// bytes and one boost byte are copied after the term text in the posting's
// blob.  The allocation is then resized to the bytes actually used.
RawPosting*
RichPost_read_raw(RichPosting *self, InStream *instream, int32_t last_doc_id,
                  CharBuf *term_text, MemoryPool *mem_pool) {
    char *const    term_bytes = (char*)CB_Get_Ptr8(term_text);
    const size_t   term_len   = CB_Get_Size(term_text);
    const uint32_t packed     = InStream_Read_C32(instream);
    const uint32_t doc_gap    = packed >> 1;
    const int32_t  doc_id     = last_doc_id + doc_gap;
    uint32_t       freq       = 1;
    uint32_t       remaining;

    UNUSED_VAR(self);

    // Low bit clear: the frequency is stored explicitly.
    if ((packed & 1) == 0) {
        freq = InStream_Read_C32(instream);
    }

    // Allocate for the worst case, then fill in the aux data.
    size_t alloc_len = MAX_RAW_POSTING_LEN(term_len, freq);
    RawPosting *const result
        = RawPost_new(MemPool_Grab(mem_pool, alloc_len), doc_id, freq,
                      term_bytes, term_len);
    char *const aux_begin = result->blob + term_len;
    char       *cursor    = aux_begin;

    // Positions, each followed by its boost byte.
    for (remaining = freq; remaining != 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
        *((uint8_t*)cursor) = InStream_Read_U8(instream);
        cursor += 1;
    }

    // Shrink the allocation down to what was actually written.
    result->aux_len = cursor - aux_begin;
    alloc_len       = cursor - (char*)result;
    MemPool_Resize(mem_pool, result, alloc_len);

    return result;
}
// Feed one RawPosting into `post_pool` for every token cluster that
// `inversion` yields.
//
// The aux data written after the term text is one encoded field-boost byte
// (doc_boost * field type boost * length_norm, encoded by the Similarity)
// followed by the C32-encoded deltas between successive token positions.
// Each posting is allocated at its maximum size from the pool's MemoryPool
// and resized to the bytes actually written before being fed to the pool.
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool *const arena        = PostPool_Get_Mem_Pool(post_pool);
    Similarity *const sim          = ivars->sim;
    const float       field_boost
        = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t     boost_byte   = Sim_Encode_Norm(sim, field_boost);
    const size_t      obj_size     = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    Token           **cluster;
    uint32_t          cluster_freq;

    Inversion_Reset(inversion);
    for (;;) {
        cluster = Inversion_Next_Cluster(inversion, &cluster_freq);
        if (cluster == NULL) {
            break;
        }

        // The first token of the cluster supplies the term text.
        TokenIVARS *const head = Token_IVARS(cluster[0]);
        uint32_t alloc_len
            = MAX_RAW_POSTING_LEN(obj_size, head->len, cluster_freq);
        void *slot = MemPool_Grab(arena, alloc_len);
        RawPosting *entry = RawPost_new(slot, doc_id, cluster_freq,
                                        head->text, head->len);
        RawPostingIVARS *const entry_ivars = RawPost_IVARS(entry);
        char *const aux_begin = entry_ivars->blob + head->len;
        char       *cursor    = aux_begin;

        // Field boost byte comes first.
        *((uint8_t*)cursor) = boost_byte;
        cursor += 1;

        // Then the position deltas, one per token in the cluster.
        uint32_t prev_pos = 0;
        Token **const cluster_end = cluster + cluster_freq;
        for (Token **tok = cluster; tok != cluster_end; tok++) {
            TokenIVARS *const tok_ivars = Token_IVARS(*tok);
            const uint32_t gap = tok_ivars->pos - prev_pos;
            NumUtil_encode_c32(gap, &cursor);
            prev_pos = tok_ivars->pos;
        }

        // Shrink the allocation down to what was actually written.
        entry_ivars->aux_len = cursor - aux_begin;
        alloc_len            = cursor - (char*)entry;
        MemPool_Resize(arena, entry, alloc_len);

        PostPool_Feed(post_pool, (Obj*)entry);
    }
}
// Read one posting from `instream` and return it as a RawPosting allocated
// from `mem_pool`.
//
// The stream supplies a doc code whose upper bits are the delta from
// `last_doc_id`; when the low bit is set the frequency is 1, otherwise the
// frequency is read next.  The posting carries the text of `term_text` and
// no further data is read.  `self` is not used.
RawPosting*
MatchPost_read_raw(MatchPosting *self, InStream *instream, i32_t last_doc_id,
                   CharBuf *term_text, MemoryPool *mem_pool) {
    const size_t term_len = CB_Get_Size(term_text);
    const u32_t  packed   = InStream_Read_C32(instream);
    const u32_t  doc_gap  = packed >> 1;
    const i32_t  doc_id   = last_doc_id + doc_gap;
    u32_t        freq     = 1;
    size_t       alloc_len;
    void        *slot;

    UNUSED_VAR(self);

    // Low bit clear: the frequency is stored explicitly.
    if ((packed & 1) == 0) {
        freq = InStream_Read_C32(instream);
    }

    alloc_len = MAX_RAW_POSTING_LEN(term_len);
    slot      = MemPool_Grab(mem_pool, alloc_len);

    return RawPost_new(slot, doc_id, freq, term_text->ptr, term_len);
}
// Feed one RawPosting into `post_pool` for every token cluster that
// `inversion` yields.
//
// After the term text, the posting's blob receives, for each token in the
// cluster, the C32-encoded delta from the previous token position followed
// by one byte holding the Similarity-encoded product of the field boost
// (doc_boost * field type boost * length_norm) and the token's own boost.
// Each posting is allocated at its maximum size from the pool's MemoryPool
// and resized to the bytes actually written before being fed to the pool.
void
RichPost_add_inversion_to_pool(RichPosting *self, PostingPool *post_pool,
                               Inversion *inversion, FieldType *type,
                               int32_t doc_id, float doc_boost,
                               float length_norm) {
    MemoryPool *const arena = PostPool_Get_Mem_Pool(post_pool);
    Similarity *const sim   = self->sim;
    const float       field_boost
        = doc_boost * FType_Get_Boost(type) * length_norm;
    Token           **cluster;
    uint32_t          cluster_freq;

    Inversion_Reset(inversion);
    for (cluster = Inversion_Next_Cluster(inversion, &cluster_freq);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &cluster_freq)
        ) {
        // The first token of the cluster supplies the term text.
        Token *const head      = cluster[0];
        uint32_t     alloc_len = MAX_RAW_POSTING_LEN(head->len, cluster_freq);
        RawPosting  *entry
            = RawPost_new(MemPool_Grab(arena, alloc_len), doc_id,
                          cluster_freq, head->text, head->len);
        char *const   aux_begin   = entry->blob + head->len;
        char         *cursor      = aux_begin;
        uint32_t      prev_pos    = 0;
        Token       **tok         = cluster;
        Token **const cluster_end = cluster + cluster_freq;

        // Position delta, then boost byte, for every token in the cluster.
        for (; tok != cluster_end; tok++) {
            Token *const   t         = *tok;
            const uint32_t gap       = t->pos - prev_pos;
            const float    tok_boost = field_boost * t->boost;

            NumUtil_encode_c32(gap, &cursor);
            prev_pos = t->pos;

            *((uint8_t*)cursor) = Sim_Encode_Norm(sim, tok_boost);
            cursor += 1;
        }

        // Shrink the allocation down to what was actually written.
        entry->aux_len = cursor - aux_begin;
        alloc_len      = cursor - (char*)entry;
        MemPool_Resize(arena, entry, alloc_len);

        PostPool_Feed(post_pool, &entry);
    }
}
// Read one posting from `instream` and return it as a RawPosting allocated
// from `mem_pool`.
//
// The stream supplies a doc code whose upper bits are the delta from
// `last_doc_id`; when the low bit is set the frequency is 1, otherwise the
// frequency is read next.  The posting carries the bytes of `term_text` and
// no further data is read.  `self` is not used.
RawPosting*
MatchPost_Read_Raw_IMP(MatchPosting *self, InStream *instream,
                       int32_t last_doc_id, String *term_text,
                       MemoryPool *mem_pool) {
    const char *const term_bytes = Str_Get_Ptr8(term_text);
    const size_t      term_len   = Str_Get_Size(term_text);
    const uint32_t    packed     = InStream_Read_C32(instream);
    const uint32_t    doc_gap    = packed >> 1;
    const int32_t     doc_id     = last_doc_id + doc_gap;
    uint32_t          freq       = 1;

    UNUSED_VAR(self);

    // Low bit clear: the frequency is stored explicitly.
    if ((packed & 1) == 0) {
        freq = InStream_Read_C32(instream);
    }

    const size_t obj_size  = VTable_Get_Obj_Alloc_Size(RAWPOSTING);
    size_t       alloc_len = MAX_RAW_POSTING_LEN(obj_size, term_len);
    void *const  slot      = MemPool_Grab(mem_pool, alloc_len);

    return RawPost_new(slot, doc_id, freq, term_bytes, term_len);
}
// Read one posting from `instream` and return it as a RawPosting allocated
// from `mem_pool`.
//
// The stream supplies a doc code whose upper bits are the delta from
// `last_doc_id`; when the low bit is set the frequency is 1, otherwise the
// frequency is read next.  After the term text, the posting's blob receives
// one field-boost byte followed by the raw C64 bytes of `freq` positions.
// The allocation is then resized to the bytes actually used.  `self` is not
// used.
RawPosting*
ScorePost_Read_Raw_IMP(ScorePosting *self, InStream *instream,
                       int32_t last_doc_id, String *term_text,
                       MemoryPool *mem_pool) {
    const char *const term_bytes = Str_Get_Ptr8(term_text);
    const size_t      term_len   = Str_Get_Size(term_text);
    const uint32_t    packed     = InStream_Read_C32(instream);
    const uint32_t    doc_gap    = packed >> 1;
    const int32_t     doc_id     = last_doc_id + doc_gap;
    uint32_t          freq       = 1;

    UNUSED_VAR(self);

    // Low bit clear: the frequency is stored explicitly.
    if ((packed & 1) == 0) {
        freq = InStream_Read_C32(instream);
    }

    // Allocate for the worst case, then fill in the aux data.
    const size_t obj_size  = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    size_t       alloc_len = MAX_RAW_POSTING_LEN(obj_size, term_len, freq);
    void *const  slot      = MemPool_Grab(mem_pool, alloc_len);
    RawPosting *const result
        = RawPost_new(slot, doc_id, freq, term_bytes, term_len);
    RawPostingIVARS *const result_ivars = RawPost_IVARS(result);
    char *const aux_begin = result_ivars->blob + term_len;
    char       *cursor    = aux_begin;

    // Field boost byte comes first.
    *((uint8_t*)cursor) = InStream_Read_U8(instream);
    cursor += 1;

    // Then the positions, copied in their raw encoded form.
    for (uint32_t remaining = freq; remaining != 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
    }

    // Shrink the allocation down to what was actually written.
    result_ivars->aux_len = cursor - aux_begin;
    alloc_len             = cursor - (char*)result;
    MemPool_Resize(mem_pool, result, alloc_len);

    return result;
}
// Write every posting fetched from `self` to `post_writer`, treating
// consecutive postings with identical term text as belonging to one term.
//
// Whenever the term text changes -- or PostPool_Fetch returns NULL, which
// ends the loop -- the previous term's text and TermInfo are handed to the
// pool's LexiconWriter, the TermInfo is reset, and a new term is started on
// `post_writer`.  When `skip_stream` is non-NULL, a skip record is written
// to it each time the current term's doc_freq reaches a non-zero multiple of
// the architecture's skip interval on a posting whose text matched the
// previous one.
//
// self:        pool to drain via PostPool_Fetch.
// post_writer: receives Start_Term, Write_Posting and Update_Skip_Info calls.
// skip_stream: destination for skip records; NULL disables skip data.
//
// NOTE(review): PostWriter_Start_Term is only called here when the term text
// changes, so the first term is presumably started by the caller -- confirm.
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    TermInfo *const tinfo = TInfo_new(0);
    TermInfo *const skip_tinfo = TInfo_new(0);
    TermInfoIVARS *const tinfo_ivars = TInfo_IVARS(tinfo);
    TermInfoIVARS *const skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter *const lex_writer = ivars->lex_writer;
    SkipStepper *const skip_stepper = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t last_skip_doc = 0;
    int64_t last_skip_filepos = 0;
    const int32_t skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables.
    // NOTE(review): CERTIFY presumably rejects a NULL or wrongly-typed first
    // fetch, i.e. the pool must hold at least one posting -- confirm.
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    // last_term_text starts out as the first posting's text, so the first
    // loop iteration sees "same text as last" and does not flush a term.
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob,
                                   post_ivars->content_len);
    const char *last_text_buf = CB_Get_Ptr8(last_term_text);
    uint32_t last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty string
    // in order to make LexiconWriter Do The Right Thing.
    // The sentinel is built in alloca'd memory and is never DECREF'd in this
    // function.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    while (1) {
        bool same_text_as_last = true;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.
            // The sentinel forces the "text changed" branch below so that the
            // final real term is flushed before the loop exits.
            posting = sentinel;
            post_ivars = RawPost_IVARS(posting);
            same_text_as_last = false;
        }
        else {
            // Compare once.
            // Lengths are checked first; memcmp only runs when they match.
            if (post_ivars->content_len != last_text_size
                || memcmp(&post_ivars->blob, last_text_buf, last_text_size) != 0
               ) {
                same_text_as_last = false;
            }
        }

        // If the term text changes, process the last term.
        if (!same_text_as_last) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            // post_filepos is read after PostWriter_Start_Term has been given
            // tinfo; both the stepper and the "last skip" bookkeeping restart
            // from that value.
            skip_stepper_ivars->doc_id = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            last_skip_doc = 0;
            last_skip_filepos = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.
            // The buffer pointer and size are re-read after the mimic call
            // rather than reusing the earlier values.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) {
            break;
        }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        // Write skip data.
        // Skipped entirely when no skip stream was supplied, and never fires
        // on the first posting of a term (same_text_as_last is false there).
        if (skip_stream != NULL
            && same_text_as_last
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }

            // Write deltas.
            // Save the stepper's previous doc id / file position, then move
            // the stepper to the current posting's doc id and to the
            // post_filepos that PostWriter_Update_Skip_Info leaves in
            // skip_tinfo; the saved values are passed to Write_Record.
            last_skip_doc = skip_stepper_ivars->doc_id;
            last_skip_filepos = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     last_skip_doc, last_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.
        // DECREF(posting);  // No!!  DON'T destroy!!!
        // Note: post_ivars is derived here even when the fetch returned NULL;
        // the top of the loop replaces it with the sentinel's ivars before
        // any field is read.
        posting = (RawPosting*)PostPool_Fetch(self);
        post_ivars = RawPost_IVARS(posting);
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}