SortFieldWriter* SortFieldWriter_init(SortFieldWriter *self, Schema *schema, Snapshot *snapshot, Segment *segment, PolyReader *polyreader, String *field, Counter *counter, size_t mem_thresh, OutStream *temp_ord_out, OutStream *temp_ix_out, OutStream *temp_dat_out) { // Init. SortEx_init((SortExternal*)self); SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self); ivars->null_ord = -1; ivars->count = 0; ivars->ord_start = 0; ivars->ord_end = 0; ivars->ix_start = 0; ivars->ix_end = 0; ivars->dat_start = 0; ivars->dat_end = 0; ivars->run_cardinality = -1; ivars->run_max = -1; ivars->sort_cache = NULL; ivars->doc_map = NULL; ivars->sorted_ids = NULL; ivars->run_tick = 1; ivars->ord_width = 0; // Assign. ivars->field = Str_Clone(field); ivars->schema = (Schema*)INCREF(schema); ivars->snapshot = (Snapshot*)INCREF(snapshot); ivars->segment = (Segment*)INCREF(segment); ivars->polyreader = (PolyReader*)INCREF(polyreader); ivars->counter = (Counter*)INCREF(counter); ivars->temp_ord_out = (OutStream*)INCREF(temp_ord_out); ivars->temp_ix_out = (OutStream*)INCREF(temp_ix_out); ivars->temp_dat_out = (OutStream*)INCREF(temp_dat_out); ivars->mem_thresh = mem_thresh; // Derive. ivars->field_num = Seg_Field_Num(segment, field); FieldType *type = (FieldType*)CERTIFY( Schema_Fetch_Type(ivars->schema, field), FIELDTYPE); ivars->type = (FieldType*)INCREF(type); ivars->prim_id = FType_Primitive_ID(type); ivars->mem_per_entry = Class_Get_Obj_Alloc_Size(SFWRITERELEM); if (ivars->prim_id == FType_TEXT) { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(STRING); ivars->var_width = true; } else if (ivars->prim_id == FType_BLOB) { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(BLOB); ivars->var_width = true; } else { ivars->mem_per_entry += Class_Get_Obj_Alloc_Size(FLOAT); ivars->var_width = false; } return self; }
// Convert every token cluster of `inversion` into a RawPosting for document
// `doc_id` and feed it to `post_pool`.
//
// Each RawPosting is allocated from the pool's MemoryPool at its maximum
// possible size, filled in, and then resized to what was actually used.
// The bytes following the term text are:
//   - one byte: the encoded norm of
//     doc_boost * FType_Get_Boost(type) * length_norm
//   - one C32 per token in the cluster: the token position as a delta from
//     the previous token's position (the first delta is relative to 0)
void
ScorePost_Add_Inversion_To_Pool_IMP(ScorePosting *self,
                                    PostingPool *post_pool,
                                    Inversion *inversion, FieldType *type,
                                    int32_t doc_id, float doc_boost,
                                    float length_norm) {
    ScorePostingIVARS *const ivars = ScorePost_IVARS(self);
    MemoryPool *const pool = PostPool_Get_Mem_Pool(post_pool);

    // The boost byte is the same for every posting produced by this call.
    const float boost = doc_boost * FType_Get_Boost(type) * length_norm;
    const uint8_t boost_byte = Sim_Encode_Norm(ivars->sim, boost);
    const size_t header_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);

    Inversion_Reset(inversion);
    for (;;) {
        uint32_t cluster_size;
        Token **const cluster
            = Inversion_Next_Cluster(inversion, &cluster_size);
        if (cluster == NULL) { break; }

        // The first token of the cluster supplies the term text.
        TokenIVARS *const head = Token_IVARS(cluster[0]);

        // Allocate for the worst case; shrunk again below.
        uint32_t alloc_bytes
            = MAX_RAW_POSTING_LEN(header_size, head->len, cluster_size);
        void *const slot = MemPool_Grab(pool, alloc_bytes);
        RawPosting *const raw
            = RawPost_new(slot, doc_id, cluster_size, head->text, head->len);
        RawPostingIVARS *const raw_ivars = RawPost_IVARS(raw);

        // Auxiliary data is written directly after the term text.
        char *const aux_begin = raw_ivars->blob + head->len;
        char *cursor = aux_begin;

        // Field boost.
        *((uint8_t*)cursor) = boost_byte;
        cursor++;

        // Delta-encoded positions.
        uint32_t prev_pos = 0;
        Token **const cluster_end = cluster + cluster_size;
        for (Token **tok = cluster; tok < cluster_end; tok++) {
            TokenIVARS *const tok_ivars = Token_IVARS(*tok);
            const uint32_t delta = tok_ivars->pos - prev_pos;
            NumUtil_encode_c32(delta, &cursor);
            prev_pos = tok_ivars->pos;
        }

        // Record how much auxiliary data was written, trim the allocation
        // to the bytes actually used, and hand the posting to the pool.
        raw_ivars->aux_len = cursor - aux_begin;
        alloc_bytes = cursor - (char*)raw;
        MemPool_Resize(pool, raw, alloc_bytes);
        PostPool_Feed(post_pool, (Obj*)raw);
    }
}
// Read one posting for `term_text` from `instream` into a RawPosting
// allocated from `mem_pool`, and return it.
//
// Stream layout consumed, in order:
//   - C32 doc code: the low bit set means the frequency is 1; the remaining
//     bits are the doc id delta added to `last_doc_id`
//   - C32 frequency, present only when the low bit of the doc code is clear
//   - one byte of field boost
//   - `freq` raw compressed integers (positions), copied through unmodified
//
// The RawPosting is allocated at its maximum possible size and then resized
// to the bytes actually used.  `self` is not consulted.
RawPosting*
ScorePost_Read_Raw_IMP(ScorePosting *self, InStream *instream,
                       int32_t last_doc_id, String *term_text,
                       MemoryPool *mem_pool) {
    UNUSED_VAR(self);

    const char *const term_ptr = Str_Get_Ptr8(term_text);
    const size_t      term_len = Str_Get_Size(term_text);

    // Decode the doc id and the frequency.
    const uint32_t doc_code = InStream_Read_C32(instream);
    const int32_t  doc_id   = last_doc_id + (doc_code >> 1);
    uint32_t freq;
    if (doc_code & 1) {
        freq = 1;
    }
    else {
        freq = InStream_Read_C32(instream);
    }

    // Allocate for the worst case; shrunk again below.
    const size_t header_size = Class_Get_Obj_Alloc_Size(RAWPOSTING);
    size_t alloc_bytes = MAX_RAW_POSTING_LEN(header_size, term_len, freq);
    void *const slot = MemPool_Grab(mem_pool, alloc_bytes);
    RawPosting *const raw
        = RawPost_new(slot, doc_id, freq, term_ptr, term_len);
    RawPostingIVARS *const raw_ivars = RawPost_IVARS(raw);

    // Auxiliary data is written directly after the term text.
    char *const aux_begin = raw_ivars->blob + term_len;
    char *cursor = aux_begin;

    // Field boost.
    *((uint8_t*)cursor) = InStream_Read_U8(instream);
    cursor++;

    // Positions: one raw compressed integer per occurrence.
    for (uint32_t remaining = freq; remaining > 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
    }

    // Record how much auxiliary data was written and trim the allocation
    // to the bytes actually used.
    raw_ivars->aux_len = cursor - aux_begin;
    alloc_bytes = cursor - (char*)raw;
    MemPool_Resize(mem_pool, raw, alloc_bytes);

    return raw;
}
// Drain this pool's postings, writing each one through `post_writer` and
// registering each completed term with the pool's LexiconWriter.
//
// Postings are fetched one at a time with PostPool_Fetch().  The term text
// of each posting (the first `content_len` bytes of its blob) is compared
// with the previous posting's text; whenever it differs, the term
// accumulated so far is handed to LexWriter_Add_Term() together with its
// TermInfo, and a fresh term is started.  When the pool runs dry, a
// stack-allocated sentinel posting with empty text forces that hand-off one
// final time for the last real term.
//
// When `skip_stream` is non-NULL, a skip record is written every
// `skip_interval` postings within a term.
//
// The first fetch must yield a RAWPOSTING; CERTIFY enforces that.
static void
S_write_terms_and_postings(PostingPool *self, PostingWriter *post_writer,
                           OutStream *skip_stream) {
    PostingPoolIVARS *const ivars            = PostPool_IVARS(self);
    TermInfo         *const tinfo            = TInfo_new(0);
    TermInfo         *const skip_tinfo       = TInfo_new(0);
    TermInfoIVARS    *const tinfo_ivars      = TInfo_IVARS(tinfo);
    TermInfoIVARS    *const skip_tinfo_ivars = TInfo_IVARS(skip_tinfo);
    LexiconWriter    *const lex_writer       = ivars->lex_writer;
    SkipStepper      *const skip_stepper     = ivars->skip_stepper;
    SkipStepperIVARS *const skip_stepper_ivars
        = SkipStepper_IVARS(skip_stepper);
    int32_t       last_skip_doc     = 0;
    int64_t       last_skip_filepos = 0;
    const int32_t skip_interval
        = Arch_Skip_Interval(Schema_Get_Architecture(ivars->schema));

    // Prime heldover variables.
    RawPosting *posting
        = (RawPosting*)CERTIFY(PostPool_Fetch(self), RAWPOSTING);
    RawPostingIVARS *post_ivars = RawPost_IVARS(posting);
    CharBuf *last_term_text
        = CB_new_from_trusted_utf8(post_ivars->blob, post_ivars->content_len);
    const char *last_text_buf  = CB_Get_Ptr8(last_term_text);
    uint32_t    last_text_size = CB_Get_Size(last_term_text);
    SkipStepper_Set_ID_And_Filepos(skip_stepper, 0, 0);

    // Initialize sentinel to be used on the last iter, using an empty string
    // in order to make LexiconWriter Do The Right Thing.
    size_t sentinel_size = Class_Get_Obj_Alloc_Size(RAWPOSTING)
                           + 20;  // blob length + cushion
    char empty_string[] = "";
    RawPosting *sentinel = RawPost_new(alloca(sentinel_size), 0, 1,
                                       empty_string, 0);

    while (1) {
        bool same_text_as_last = true;

        if (posting == NULL) {
            // On the last iter, use an empty string to make LexiconWriter
            // DTRT.
            posting = sentinel;
            post_ivars = RawPost_IVARS(posting);
            same_text_as_last = false;
        }
        else {
            // Compare once.  The text lives in the blob itself, so pass the
            // blob rather than the address of the member.
            if (post_ivars->content_len != last_text_size
                || memcmp(post_ivars->blob, last_text_buf,
                          last_text_size) != 0
               ) {
                same_text_as_last = false;
            }
        }

        // If the term text changes, process the last term.
        if (!same_text_as_last) {
            // Hand off to LexiconWriter.
            LexWriter_Add_Term(lex_writer, (Obj*)last_term_text, tinfo);

            // Start each term afresh.
            TInfo_Reset(tinfo);
            PostWriter_Start_Term(post_writer, tinfo);

            // Init skip data in preparation for the next term.
            skip_stepper_ivars->doc_id  = 0;
            skip_stepper_ivars->filepos = tinfo_ivars->post_filepos;
            last_skip_doc     = 0;
            last_skip_filepos = tinfo_ivars->post_filepos;

            // Remember the term_text so we can write string diffs.  The
            // cached pointer and size must be refreshed after the CharBuf
            // has been overwritten.
            CB_Mimic_Utf8(last_term_text, post_ivars->blob,
                          post_ivars->content_len);
            last_text_buf  = CB_Get_Ptr8(last_term_text);
            last_text_size = CB_Get_Size(last_term_text);
        }

        // Bail on last iter before writing invalid posting data.
        if (posting == sentinel) { break; }

        // Write posting data.
        PostWriter_Write_Posting(post_writer, posting);

        // Doc freq lags by one iter.
        tinfo_ivars->doc_freq++;

        // Write skip data.
        if (skip_stream != NULL
            && same_text_as_last
            && tinfo_ivars->doc_freq % skip_interval == 0
            && tinfo_ivars->doc_freq != 0
           ) {
            // If first skip group, save skip stream pos for term info.
            if (tinfo_ivars->doc_freq == skip_interval) {
                tinfo_ivars->skip_filepos = OutStream_Tell(skip_stream);
            }

            // Write deltas: remember the previous skip point, advance the
            // stepper to the current doc id and file position, then emit a
            // record relative to the previous point.
            last_skip_doc     = skip_stepper_ivars->doc_id;
            last_skip_filepos = skip_stepper_ivars->filepos;
            skip_stepper_ivars->doc_id = post_ivars->doc_id;
            PostWriter_Update_Skip_Info(post_writer, skip_tinfo);
            skip_stepper_ivars->filepos = skip_tinfo_ivars->post_filepos;
            SkipStepper_Write_Record(skip_stepper, skip_stream,
                                     last_skip_doc, last_skip_filepos);
        }

        // Retrieve the next posting from the sort pool.  The posting just
        // written is deliberately NOT DECREF'd here (the original author
        // flags destroying it as wrong; presumably its storage is owned
        // elsewhere -- confirm before changing).
        //
        // Only refresh post_ivars when a posting was actually returned:
        // the ivars accessor must not be handed a NULL object.  On
        // exhaustion the sentinel branch at the top of the loop sets
        // post_ivars before it is read again.
        posting = (RawPosting*)PostPool_Fetch(self);
        if (posting != NULL) {
            post_ivars = RawPost_IVARS(posting);
        }
    }

    // Clean up.
    DECREF(last_term_text);
    DECREF(skip_tinfo);
    DECREF(tinfo);
}