PolyPostingList*
PolyPList_init(PolyPostingList *self, const CharBuf *field, VArray *readers,
               I32Array *starts) {
    u32_t i;
    const u32_t num_readers = VA_Get_Size(readers);

    /* Init. */
    self->tick    = 0;
    self->current = NULL;

    /* Assign. */
    self->field = CB_Clone(field);

    /* Get sub-posting_lists and assign offsets. */
    self->sub_plists = VA_new(num_readers);
    for (i = 0; i < num_readers; i++) {
        PostingsReader *const post_reader = (PostingsReader*)ASSERT_IS_A(
            VA_Fetch(readers, i), POSTINGSREADER);
        i32_t offset = I32Arr_Get(starts, i);
        SegPostingList *sub_plist = (SegPostingList*)PostReader_Posting_List(
            post_reader, field, NULL);

        if (sub_plist) {
            ASSERT_IS_A(sub_plist, SEGPOSTINGLIST);
            SegPList_Set_Doc_Base(sub_plist, offset);
            VA_Push(self->sub_plists, (Obj*)sub_plist);
        }
    }
    self->num_subs = VA_Get_Size(self->sub_plists);

    return self;
}
void
Indexer_add_index(Indexer *self, Obj *index) {
    Folder *other_folder = NULL;
    IndexReader *reader  = NULL;

    if (Obj_Is_A(index, FOLDER)) {
        other_folder = (Folder*)INCREF(index);
    }
    else if (Obj_Is_A(index, CHARBUF)) {
        other_folder = (Folder*)FSFolder_new((CharBuf*)index);
    }
    else {
        THROW(ERR, "Invalid type for 'index': %o", Obj_Get_Class_Name(index));
    }

    reader = IxReader_open((Obj*)other_folder, NULL, NULL);
    if (reader == NULL) {
        THROW(ERR, "Index doesn't seem to contain any data");
    }
    else {
        Schema *schema       = self->schema;
        Schema *other_schema = IxReader_Get_Schema(reader);
        VArray *other_fields = Schema_All_Fields(other_schema);
        VArray *seg_readers  = IxReader_Seg_Readers(reader);
        uint32_t i, max;

        // Validate schema compatibility and add fields.
        Schema_Eat(schema, other_schema);

        // Add fields to Segment.
        for (i = 0, max = VA_Get_Size(other_fields); i < max; i++) {
            CharBuf *other_field = (CharBuf*)VA_Fetch(other_fields, i);
            Seg_Add_Field(self->segment, other_field);
        }
        DECREF(other_fields);

        // Add all segments.
        for (i = 0, max = VA_Get_Size(seg_readers); i < max; i++) {
            SegReader *seg_reader = (SegReader*)VA_Fetch(seg_readers, i);
            DeletionsReader *del_reader = (DeletionsReader*)SegReader_Fetch(
                seg_reader, VTable_Get_Name(DELETIONSREADER));
            Matcher *deletions = del_reader
                                 ? DelReader_Iterator(del_reader)
                                 : NULL;
            I32Array *doc_map = DelWriter_Generate_Doc_Map(
                self->del_writer, deletions,
                SegReader_Doc_Max(seg_reader),
                (int32_t)Seg_Get_Count(self->segment));
            SegWriter_Add_Segment(self->seg_writer, seg_reader, doc_map);
            DECREF(deletions);
            DECREF(doc_map);
        }
        DECREF(seg_readers);
    }

    DECREF(reader);
    DECREF(other_folder);
}
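/* Minimal usage sketch for Indexer_add_index(): it reuses only calls that
 * appear elsewhere in this listing (Indexer_new, Indexer_Commit,
 * RAMFolder_new, CB_newf); the Method-macro spelling Indexer_Add_Index and
 * the source path are assumptions.
 */
static Folder*
S_absorb_index_example(void) {
    Schema    *schema  = (Schema*)TestSchema_new();
    RAMFolder *folder  = RAMFolder_new(NULL);
    Indexer   *indexer = Indexer_new(schema, (Obj*)folder, NULL, NULL, 0);
    CharBuf   *source  = CB_newf("/path/to/existing/index"); /* hypothetical */

    /* 'index' may be a Folder or a CharBuf path; anything else throws. */
    Indexer_Add_Index(indexer, (Obj*)source);
    Indexer_Commit(indexer);

    DECREF(source);
    DECREF(indexer);
    DECREF(schema);
    return (Folder*)folder;
}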
VArray*
HeatMap_Flatten_Spans_IMP(HeatMap *self, VArray *spans) {
    const uint32_t num_spans = VA_Get_Size(spans);
    UNUSED_VAR(self);

    if (!num_spans) {
        return VA_new(0);
    }
    else {
        VArray *flattened = S_flattened_but_empty_spans(spans);
        const uint32_t num_raw_flattened = VA_Get_Size(flattened);

        // Iterate over each of the source spans, contributing their scores
        // to any destination span that falls within range.
        uint32_t dest_tick = 0;
        for (uint32_t i = 0; i < num_spans; i++) {
            Span *source_span = (Span*)VA_Fetch(spans, i);
            int32_t source_span_offset = Span_Get_Offset(source_span);
            int32_t source_span_len    = Span_Get_Length(source_span);
            int32_t source_span_end    = source_span_offset + source_span_len;

            // Get the location of the flattened span that shares the source
            // span's offset.
            for (; dest_tick < num_raw_flattened; dest_tick++) {
                Span *dest_span = (Span*)VA_Fetch(flattened, dest_tick);
                if (Span_Get_Offset(dest_span) == source_span_offset) {
                    break;
                }
            }

            // Fill in scores.
            for (uint32_t j = dest_tick; j < num_raw_flattened; j++) {
                Span *dest_span = (Span*)VA_Fetch(flattened, j);
                if (Span_Get_Offset(dest_span) == source_span_end) {
                    break;
                }
                else {
                    float new_weight = Span_Get_Weight(dest_span)
                                       + Span_Get_Weight(source_span);
                    Span_Set_Weight(dest_span, new_weight);
                }
            }
        }

        // Drop spans that didn't pick up any score, leaving holes instead.
        dest_tick = 0;
        for (uint32_t i = 0; i < num_raw_flattened; i++) {
            Span *span = (Span*)VA_Fetch(flattened, i);
            if (Span_Get_Weight(span)) {
                VA_Store(flattened, dest_tick++, INCREF(span));
            }
        }
        VA_Excise(flattened, dest_tick, num_raw_flattened - dest_tick);

        return flattened;
    }
}
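/* Worked example of the flattening (hypothetical numbers): given source
 * spans (offset 0, length 10, weight 1.0) and (offset 5, length 10,
 * weight 2.0), S_flattened_but_empty_spans() derives the unique boundaries
 * {0, 5, 10, 15} and produces zero-weight zones (0,5), (5,5), and (10,5).
 * The scoring pass above then leaves them weighted 1.0, 3.0, and 2.0
 * respectively -- the middle zone is covered by both sources -- and nothing
 * is excised because every zone picked up some score.
 */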
bool_t
Folder_delete_tree(Folder *self, const CharBuf *path) {
    Folder *enclosing_folder = Folder_Enclosing_Folder(self, path);

    // Don't allow Folder to delete itself.
    if (!path || !CB_Get_Size(path)) { return false; }

    if (enclosing_folder) {
        ZombieCharBuf *local = IxFileNames_local_part(path, ZCB_BLANK());
        if (Folder_Local_Is_Directory(enclosing_folder, (CharBuf*)local)) {
            Folder *inner_folder
                = Folder_Local_Find_Folder(enclosing_folder, (CharBuf*)local);
            DirHandle *dh = Folder_Local_Open_Dir(inner_folder);
            if (dh) {
                VArray *files = VA_new(20);
                VArray *dirs  = VA_new(20);
                CharBuf *entry = DH_Get_Entry(dh);
                while (DH_Next(dh)) {
                    VA_Push(files, (Obj*)CB_Clone(entry));
                    if (DH_Entry_Is_Dir(dh) && !DH_Entry_Is_Symlink(dh)) {
                        VA_Push(dirs, (Obj*)CB_Clone(entry));
                    }
                }
                // Recurse into subdirectories first...
                for (uint32_t i = 0, max = VA_Get_Size(dirs); i < max; i++) {
                    CharBuf *name = (CharBuf*)VA_Fetch(dirs, i);
                    bool_t success = Folder_Delete_Tree(inner_folder, name);
                    if (!success && Folder_Local_Exists(inner_folder, name)) {
                        break;
                    }
                }
                // ...then delete the local entries.
                for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) {
                    CharBuf *name = (CharBuf*)VA_Fetch(files, i);
                    bool_t success = Folder_Local_Delete(inner_folder, name);
                    if (!success && Folder_Local_Exists(inner_folder, name)) {
                        break;
                    }
                }
                DECREF(dirs);
                DECREF(files);
                DECREF(dh);
            }
        }
        return Folder_Local_Delete(enclosing_folder, (CharBuf*)local);
    }
    else {
        // Return failure if the entry wasn't there in the first place.
        return false;
    }
}
VArray*
IxManager_recycle(IndexManager *self, PolyReader *reader,
                  DeletionsWriter *del_writer, int64_t cutoff,
                  bool_t optimize) {
    VArray *seg_readers = PolyReader_Get_Seg_Readers(reader);
    VArray *candidates  = VA_Gather(seg_readers, S_check_cutoff, &cutoff);
    VArray *recyclables = VA_new(VA_Get_Size(candidates));
    const uint32_t num_candidates = VA_Get_Size(candidates);

    if (optimize) {
        DECREF(recyclables);
        return candidates;
    }

    // Sort by ascending size in docs, choose sparsely populated segments.
    VA_Sort(candidates, S_compare_doc_count, NULL);
    int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    for (uint32_t i = 0; i < num_candidates; i++) {
        SegReader *seg_reader
            = (SegReader*)CERTIFY(VA_Fetch(candidates, i), SEGREADER);
        counts[i] = SegReader_Doc_Count(seg_reader);
    }
    I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates);
    uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Move SegReaders to be recycled.
    for (uint32_t i = 0; i < threshold; i++) {
        VA_Store(recyclables, i, VA_Delete(candidates, i));
    }

    // Find segments where at least 10% of all docs have been deleted.
    for (uint32_t i = threshold; i < num_candidates; i++) {
        SegReader *seg_reader = (SegReader*)VA_Delete(candidates, i);
        CharBuf   *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        double doc_max        = SegReader_Doc_Max(seg_reader);
        double num_deletions  = DelWriter_Seg_Del_Count(del_writer, seg_name);
        double del_proportion = num_deletions / doc_max;
        if (del_proportion >= 0.1) {
            VA_Push(recyclables, (Obj*)seg_reader);
        }
        else {
            DECREF(seg_reader);
        }
    }

    DECREF(candidates);
    return recyclables;
}
void
SortWriter_add_segment(SortWriter *self, SegReader *reader,
                       I32Array *doc_map) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);
    VArray *fields = Schema_All_Fields(ivars->schema);

    // Proceed field-at-a-time, rather than doc-at-a-time.
    for (uint32_t i = 0, max = VA_Get_Size(fields); i < max; i++) {
        CharBuf *field = (CharBuf*)VA_Fetch(fields, i);
        SortReader *sort_reader = (SortReader*)SegReader_Fetch(
            reader, VTable_Get_Name(SORTREADER));
        SortCache *cache = sort_reader
                           ? SortReader_Fetch_Sort_Cache(sort_reader, field)
                           : NULL;
        if (cache) {
            int32_t field_num = Seg_Field_Num(ivars->segment, field);
            SortFieldWriter *field_writer
                = S_lazy_init_field_writer(self, field_num);
            SortFieldWriter_Add_Segment(field_writer, reader, doc_map, cache);
            ivars->flush_at_finish = true;
        }
    }

    DECREF(fields);
}
void
SortWriter_add_inverted_doc(SortWriter *self, Inverter *inverter,
                            int32_t doc_id) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);
    int32_t field_num;

    Inverter_Iterate(inverter);
    while (0 != (field_num = Inverter_Next(inverter))) {
        FieldType *type = Inverter_Get_Type(inverter);
        if (FType_Sortable(type)) {
            SortFieldWriter *field_writer
                = S_lazy_init_field_writer(self, field_num);
            SortFieldWriter_Add(field_writer, doc_id,
                                Inverter_Get_Value(inverter));
        }
    }

    // If our SortFieldWriters have collectively passed the memory threshold,
    // flush all of them, then release all unique values with a single action.
    if (MemPool_Get_Consumed(ivars->mem_pool) > ivars->mem_thresh) {
        for (uint32_t i = 0; i < VA_Get_Size(ivars->field_writers); i++) {
            SortFieldWriter *const field_writer
                = (SortFieldWriter*)VA_Fetch(ivars->field_writers, i);
            if (field_writer) {
                SortFieldWriter_Flush(field_writer);
            }
        }
        MemPool_Release_All(ivars->mem_pool);
        ivars->flush_at_finish = true;
    }
}
static void
S_read_fsfolder(RAMFolder *self) {
    u32_t i, max;

    /* Open an FSFolder for reading. */
    FSFolder *source_folder = FSFolder_new(self->path);
    VArray   *files         = FSFolder_List(source_folder);

    /* Copy every file in the FSFolder into RAM. */
    for (i = 0, max = VA_Get_Size(files); i < max; i++) {
        CharBuf   *filepath      = (CharBuf*)VA_Fetch(files, i);
        InStream  *source_stream = FSFolder_Open_In(source_folder, filepath);
        OutStream *outstream     = RAMFolder_Open_Out(self, filepath);
        if (!source_stream) { THROW("Can't open %o", filepath); }
        if (!outstream)     { THROW("Can't open %o", filepath); }
        OutStream_Absorb(outstream, source_stream);
        OutStream_Close(outstream);
        InStream_Close(source_stream);
        DECREF(outstream);
        DECREF(source_stream);
    }

    DECREF(files);
    FSFolder_Close(source_folder);
    DECREF(source_folder);
}
static void
test_stemming(TestBatchRunner *runner) {
    FSFolder *modules_folder = TestUtils_modules_folder();
    String *path = Str_newf("analysis/snowstem/source/test/tests.json");
    Hash *tests = (Hash*)Json_slurp_json((Folder*)modules_folder, path);
    if (!tests) { RETHROW(Err_get_error()); }

    String *iso;
    Hash   *lang_data;
    Hash_Iterate(tests);
    while (Hash_Next(tests, (Obj**)&iso, (Obj**)&lang_data)) {
        VArray *words = (VArray*)Hash_Fetch_Utf8(lang_data, "words", 5);
        VArray *stems = (VArray*)Hash_Fetch_Utf8(lang_data, "stems", 5);
        SnowballStemmer *stemmer = SnowStemmer_new(iso);
        for (uint32_t i = 0, max = VA_Get_Size(words); i < max; i++) {
            String *word = (String*)VA_Fetch(words, i);
            VArray *got  = SnowStemmer_Split(stemmer, word);
            String *stem = (String*)VA_Fetch(got, 0);
            TEST_TRUE(runner,
                      stem
                      && Str_Is_A(stem, STRING)
                      && Str_Equals(stem, VA_Fetch(stems, i)),
                      "Stem %s: %s", Str_Get_Ptr8(iso), Str_Get_Ptr8(word));
            DECREF(got);
        }
        DECREF(stemmer);
    }

    DECREF(tests);
    DECREF(modules_folder);
    DECREF(path);
}
ProximityCompiler*
ProximityCompiler_init(ProximityCompiler *self, ProximityQuery *parent,
                       Searcher *searcher, float boost, uint32_t within) {
    ProximityCompilerIVARS *const ivars = ProximityCompiler_IVARS(self);
    ProximityQueryIVARS *const parent_ivars = ProximityQuery_IVARS(parent);
    Schema     *schema = Searcher_Get_Schema(searcher);
    Similarity *sim    = Schema_Fetch_Sim(schema, parent_ivars->field);
    VArray     *terms  = parent_ivars->terms;

    ivars->within = within;

    // Try harder to find a Similarity if necessary.
    if (!sim) { sim = Schema_Get_Similarity(schema); }

    // Init.
    Compiler_init((Compiler*)self, (Query*)parent, searcher, sim, boost);

    // Store IDF for the phrase.
    ivars->idf = 0;
    for (uint32_t i = 0, max = VA_Get_Size(terms); i < max; i++) {
        Obj *term = VA_Fetch(terms, i);
        int32_t doc_max  = Searcher_Doc_Max(searcher);
        int32_t doc_freq = Searcher_Doc_Freq(searcher, parent_ivars->field,
                                             term);
        ivars->idf += Sim_IDF(sim, doc_freq, doc_max);
    }

    // Calculate raw weight.
    ivars->raw_weight = ivars->idf * ivars->boost;

    return self;
}
void
SortEx_Shrink_IMP(SortExternal *self) {
    SortExternalIVARS *const ivars = SortEx_IVARS(self);
    if (ivars->buf_max - ivars->buf_tick > 0) {
        size_t buf_count = SortEx_Buffer_Count(self);
        size_t size      = buf_count * sizeof(Obj*);
        if (ivars->buf_tick > 0) {
            Obj **start = ivars->buffer + ivars->buf_tick;
            memmove(ivars->buffer, start, size);
        }
        ivars->buffer   = (Obj**)REALLOCATE(ivars->buffer, size);
        ivars->buf_tick = 0;
        ivars->buf_max  = buf_count;
        ivars->buf_cap  = buf_count;
    }
    else {
        FREEMEM(ivars->buffer);
        ivars->buffer   = NULL;
        ivars->buf_tick = 0;
        ivars->buf_max  = 0;
        ivars->buf_cap  = 0;
    }
    ivars->scratch_cap = 0;
    FREEMEM(ivars->scratch);
    ivars->scratch = NULL;

    for (uint32_t i = 0, max = VA_Get_Size(ivars->runs); i < max; i++) {
        SortExternal *run = (SortExternal*)VA_Fetch(ivars->runs, i);
        SortEx_Shrink(run);
    }
}
static Folder*
S_create_index() {
    Schema    *schema  = (Schema*)TestSchema_new();
    RAMFolder *folder  = RAMFolder_new(NULL);
    VArray    *doc_set = TestUtils_doc_set();
    Indexer   *indexer = Indexer_new(schema, (Obj*)folder, NULL, NULL, 0);
    u32_t i, max;

    for (i = 0, max = VA_Get_Size(doc_set); i < max; i++) {
        /* ZCB_LITERAL initializes a ZombieCharBuf, not a plain CharBuf. */
        static ZombieCharBuf field = ZCB_LITERAL("content");
        Doc *doc = Doc_new(NULL, 0);
        Doc_Store(doc, (CharBuf*)&field, VA_Fetch(doc_set, i));
        Indexer_Add_Doc(indexer, doc, 1.0f);
        DECREF(doc);
    }
    Indexer_Commit(indexer);

    DECREF(doc_set);
    DECREF(indexer);
    DECREF(schema);
    return (Folder*)folder;
}
static ORMatcher*
S_ormatcher_init2(ORMatcher *self, ORMatcherIVARS *ivars, VArray *children,
                  Similarity *sim) {
    // Init.
    PolyMatcher_init((PolyMatcher*)self, children, sim);
    ivars->size = 0;

    // Derive.
    ivars->max_size = VA_Get_Size(children);

    // Allocate.
    ivars->heap = (HeapedMatcherDoc**)CALLOCATE(ivars->max_size + 1,
                                                sizeof(HeapedMatcherDoc*));

    // Create a pool of HMDs.  Encourage CPU cache hits by using a single
    // allocation for all of them.
    size_t amount_to_malloc = (ivars->max_size + 1) * sizeof(HeapedMatcherDoc);
    ivars->blob = (char*)MALLOCATE(amount_to_malloc);
    ivars->pool = (HeapedMatcherDoc**)CALLOCATE(ivars->max_size + 1,
                                                sizeof(HeapedMatcherDoc*));
    for (uint32_t i = 1; i <= ivars->max_size; i++) {
        size_t offset = i * sizeof(HeapedMatcherDoc);
        HeapedMatcherDoc *hmd = (HeapedMatcherDoc*)(ivars->blob + offset);
        ivars->pool[i] = hmd;
    }

    // Prime queue.
    for (uint32_t i = 0; i < ivars->max_size; i++) {
        Matcher *matcher = (Matcher*)VA_Fetch(children, i);
        if (matcher) {
            S_add_element(self, ivars, (Matcher*)INCREF(matcher), 0);
        }
    }

    return self;
}
VArray*
HeatMap_generate_proximity_boosts(HeatMap *self, VArray *spans) {
    VArray *boosts = VA_new(0);
    const uint32_t num_spans = VA_Get_Size(spans);

    if (num_spans > 1) {
        for (uint32_t i = 0, max = num_spans - 1; i < max; i++) {
            Span *span1 = (Span*)VA_Fetch(spans, i);

            for (uint32_t j = i + 1; j <= max; j++) {
                Span *span2 = (Span*)VA_Fetch(spans, j);
                float prox_score
                    = HeatMap_Calc_Proximity_Boost(self, span1, span2);
                if (prox_score == 0) {
                    break;
                }
                else {
                    int32_t length = (span2->offset - span1->offset)
                                     + span2->length;
                    VA_Push(boosts,
                            (Obj*)Span_new(span1->offset, length, prox_score));
                }
            }
        }
    }

    return boosts;
}
PolySearcher*
PolySearcher_init(PolySearcher *self, Schema *schema, VArray *searchers) {
    const uint32_t num_searchers = VA_Get_Size(searchers);
    int32_t *starts_array
        = (int32_t*)MALLOCATE(num_searchers * sizeof(int32_t));
    int32_t doc_max = 0;

    Searcher_init((Searcher*)self, schema);
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    ivars->searchers = (VArray*)INCREF(searchers);
    ivars->starts    = NULL; // Safe cleanup.

    for (uint32_t i = 0; i < num_searchers; i++) {
        Searcher *searcher
            = (Searcher*)CERTIFY(VA_Fetch(searchers, i), SEARCHER);
        Schema *candidate       = Searcher_Get_Schema(searcher);
        Class  *orig_class      = Schema_Get_Class(schema);
        Class  *candidate_class = Schema_Get_Class(candidate);

        // Confirm that searchers all use the same schema.
        if (orig_class != candidate_class) {
            THROW(ERR, "Conflicting schemas: '%o', '%o'",
                  Schema_Get_Class_Name(schema),
                  Schema_Get_Class_Name(candidate));
        }

        // Derive doc_max and relative start offsets.
        starts_array[i] = (int32_t)doc_max;
        doc_max += Searcher_Doc_Max(searcher);
    }

    ivars->doc_max = doc_max;
    ivars->starts  = I32Arr_new_steal(starts_array, num_searchers);

    return self;
}
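/* Worked example (hypothetical figures): combining a searcher over 1,000
 * documents with one over 250 yields starts = [0, 1000] and doc_max = 1250.
 * The start offsets let the PolySearcher map each sub-searcher's local doc
 * IDs into a single global numbering, the same doc-base scheme that
 * PolyPList_init() applies above via SegPList_Set_Doc_Base().
 */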
Inversion*
PolyAnalyzer_Transform_Text_IMP(PolyAnalyzer *self, String *text) {
    VArray *const analyzers = PolyAnalyzer_IVARS(self)->analyzers;
    const uint32_t num_analyzers = VA_Get_Size(analyzers);
    Inversion *retval;

    if (num_analyzers == 0) {
        size_t token_len = Str_Get_Size(text);
        const char *buf  = Str_Get_Ptr8(text);
        Token *seed = Token_new(buf, token_len, 0, token_len, 1.0f, 1);
        retval = Inversion_new(seed);
        DECREF(seed);
    }
    else {
        Analyzer *first_analyzer = (Analyzer*)VA_Fetch(analyzers, 0);
        retval = Analyzer_Transform_Text(first_analyzer, text);
        for (uint32_t i = 1; i < num_analyzers; i++) {
            Analyzer *analyzer = (Analyzer*)VA_Fetch(analyzers, i);
            Inversion *new_inversion = Analyzer_Transform(analyzer, retval);
            DECREF(retval);
            retval = new_inversion;
        }
    }

    return retval;
}
static void
S_init_arena(MemoryPool *self, size_t amount) {
    ByteBuf *bb;

    // Indicate which arena we're using at present.
    self->tick++;

    if (self->tick < (int32_t)VA_Get_Size(self->arenas)) {
        // In recycle mode, use previously acquired memory.
        bb = (ByteBuf*)VA_Fetch(self->arenas, self->tick);
        if (amount >= BB_Get_Size(bb)) {
            BB_Grow(bb, amount);
            BB_Set_Size(bb, amount);
        }
    }
    else {
        // In add mode, get more mem from system.
        size_t buf_size = (amount + 1) > self->arena_size
                          ? (amount + 1)
                          : self->arena_size;
        char *ptr = (char*)MALLOCATE(buf_size);
        bb = BB_new_steal_bytes(ptr, buf_size - 1, buf_size);
        VA_Push(self->arenas, (Obj*)bb);
    }

    // Recalculate consumption to take into account blocked off space.
    self->consumed = 0;
    for (int32_t i = 0; i < self->tick; i++) {
        ByteBuf *used_bb = (ByteBuf*)VA_Fetch(self->arenas, i);
        self->consumed += BB_Get_Size(used_bb);
    }

    self->buf   = BB_Get_Buf(bb);
    self->limit = self->buf + BB_Get_Size(bb);
}
void
DefDelWriter_Delete_By_Query_IMP(DefaultDeletionsWriter *self, Query *query) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    Compiler *compiler = Query_Make_Compiler(query, (Searcher*)ivars->searcher,
                                             Query_Get_Boost(query), false);

    for (uint32_t i = 0, max = VA_Get_Size(ivars->seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, i);
        BitVector *bit_vec    = (BitVector*)VA_Fetch(ivars->bit_vecs, i);
        Matcher *matcher = Compiler_Make_Matcher(compiler, seg_reader, false);

        if (matcher) {
            int32_t doc_id;
            int32_t num_zapped = 0;

            // Iterate through matches, marking each doc as deleted.
            while (0 != (doc_id = Matcher_Next(matcher))) {
                num_zapped += !BitVec_Get(bit_vec, doc_id);
                BitVec_Set(bit_vec, doc_id);
            }
            if (num_zapped) { ivars->updated[i] = true; }

            DECREF(matcher);
        }
    }

    DECREF(compiler);
}
PhraseCompiler*
PhraseCompiler_init(PhraseCompiler *self, PhraseQuery *parent,
                    Searcher *searcher, float boost) {
    Schema     *schema = Searcher_Get_Schema(searcher);
    Similarity *sim    = Schema_Fetch_Sim(schema, parent->field);
    VArray     *terms  = parent->terms;
    uint32_t i, max;

    // Try harder to find a Similarity if necessary.
    if (!sim) { sim = Schema_Get_Similarity(schema); }

    // Init.
    Compiler_init((Compiler*)self, (Query*)parent, searcher, sim, boost);

    // Store IDF for the phrase.
    self->idf = 0;
    for (i = 0, max = VA_Get_Size(terms); i < max; i++) {
        Obj *term = VA_Fetch(terms, i);
        int32_t doc_max  = Searcher_Doc_Max(searcher);
        int32_t doc_freq = Searcher_Doc_Freq(searcher, parent->field, term);
        self->idf += Sim_IDF(sim, doc_freq, doc_max);
    }

    // Calculate raw weight.
    self->raw_weight = self->idf * self->boost;

    // Make final preparations.
    PhraseCompiler_Normalize(self);

    return self;
}
Hash*
DefDelWriter_Metadata_IMP(DefaultDeletionsWriter *self) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    DefDelWriter_Metadata_t super_meta
        = (DefDelWriter_Metadata_t)SUPER_METHOD_PTR(DEFAULTDELETIONSWRITER,
                                                    LUCY_DefDelWriter_Metadata);
    Hash *const metadata = super_meta(self);
    Hash *const files    = Hash_new(0);

    for (uint32_t i = 0, max = VA_Get_Size(ivars->seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, i);

        if (ivars->updated[i]) {
            BitVector *deldocs   = (BitVector*)VA_Fetch(ivars->bit_vecs, i);
            Segment   *segment   = SegReader_Get_Segment(seg_reader);
            Hash      *mini_meta = Hash_new(2);
            Hash_Store_Utf8(mini_meta, "count", 5,
                            (Obj*)Str_newf("%u32",
                                           (uint32_t)BitVec_Count(deldocs)));
            Hash_Store_Utf8(mini_meta, "filename", 8,
                            (Obj*)S_del_filename(self, seg_reader));
            Hash_Store(files, (Obj*)Seg_Get_Name(segment), (Obj*)mini_meta);
        }
    }
    Hash_Store_Utf8(metadata, "files", 5, (Obj*)files);

    return metadata;
}
void
DefDelWriter_Delete_By_Term_IMP(DefaultDeletionsWriter *self, String *field,
                                Obj *term) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);

    for (uint32_t i = 0, max = VA_Get_Size(ivars->seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, i);
        PostingListReader *plist_reader
            = (PostingListReader*)SegReader_Fetch(
                  seg_reader, Class_Get_Name(POSTINGLISTREADER));
        BitVector *bit_vec = (BitVector*)VA_Fetch(ivars->bit_vecs, i);
        PostingList *plist = plist_reader
                             ? PListReader_Posting_List(plist_reader, field,
                                                        term)
                             : NULL;
        int32_t doc_id;
        int32_t num_zapped = 0;

        // Iterate through postings, marking each doc as deleted.
        if (plist) {
            while (0 != (doc_id = PList_Next(plist))) {
                num_zapped += !BitVec_Get(bit_vec, doc_id);
                BitVec_Set(bit_vec, doc_id);
            }
            if (num_zapped) { ivars->updated[i] = true; }
            DECREF(plist);
        }
    }
}
CharBuf*
IxManager_make_snapshot_filename(IndexManager *self) {
    VArray *files = Folder_List(self->folder);
    u32_t i, max;
    i32_t max_gen = 0;

    for (i = 0, max = VA_Get_Size(files); i < max; i++) {
        CharBuf *file = (CharBuf*)VA_Fetch(files, i);
        if (CB_Starts_With_Str(file, "snapshot_", 9)
            && CB_Ends_With_Str(file, ".json", 5)
        ) {
            i32_t gen = IxFileNames_extract_gen(file);
            if (gen > max_gen) { max_gen = gen; }
        }
    }
    DECREF(files);

    {
        i32_t new_gen = max_gen + 1;
        CharBuf *base_36  = StrHelp_to_base36(new_gen);
        CharBuf *snapfile = CB_newf("snapshot_%o.json", base_36);
        DECREF(base_36);
        return snapfile;
    }
}
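/* Worked example (assuming the generation is read back from the base-36
 * portion of the name): if the folder already holds snapshot_z.json, that is
 * generation 35 ('z' in base 36), so the next call produces generation 36,
 * which renders as "10" in base 36, and the filename returned is
 * snapshot_10.json.
 */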
void
DefDelWriter_Finish_IMP(DefaultDeletionsWriter *self) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    Folder *const folder = ivars->folder;

    for (uint32_t i = 0, max = VA_Get_Size(ivars->seg_readers); i < max; i++) {
        SegReader *seg_reader = (SegReader*)VA_Fetch(ivars->seg_readers, i);
        if (ivars->updated[i]) {
            BitVector *deldocs   = (BitVector*)VA_Fetch(ivars->bit_vecs, i);
            int32_t    doc_max   = SegReader_Doc_Max(seg_reader);
            double     used      = (doc_max + 1) / 8.0;
            uint32_t   byte_size = (uint32_t)ceil(used);
            uint32_t   new_max   = byte_size * 8 - 1;
            String    *filename  = S_del_filename(self, seg_reader);
            OutStream *outstream = Folder_Open_Out(folder, filename);
            if (!outstream) { RETHROW(INCREF(Err_get_error())); }

            // Ensure that we have 1 bit for each doc in segment.
            BitVec_Grow(deldocs, new_max);

            // Write deletions data and clean up.
            OutStream_Write_Bytes(outstream,
                                  (char*)BitVec_Get_Raw_Bits(deldocs),
                                  byte_size);
            OutStream_Close(outstream);
            DECREF(outstream);
            DECREF(filename);
        }
    }

    Seg_Store_Metadata_Utf8(ivars->segment, "deletions", 9,
                            (Obj*)DefDelWriter_Metadata(self));
}
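/* Worked example of the sizing arithmetic: for a segment with
 * doc_max = 100, used = 101 / 8.0 = 12.625, so byte_size = 13 and
 * new_max = 103.  The bit vector is grown to a whole number of bytes
 * (bit 0 corresponds to the nonexistent doc 0) before its raw bits are
 * written out.
 */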
VArray*
IxManager_segreaders_to_merge(IndexManager *self, PolyReader *reader,
                              bool_t all) {
    VArray *seg_readers = VA_Shallow_Copy(PolyReader_Get_Seg_Readers(reader));
    UNUSED_VAR(self);

    if (!all) {
        u32_t i;
        u32_t total_docs = 0;
        u32_t threshold  = 0;
        const u32_t num_seg_readers = VA_Get_Size(seg_readers);

        /* Sort by ascending size in docs. */
        VA_Sort(seg_readers, S_compare_doc_count);

        /* Find sparsely populated segments. */
        for (i = 0; i < num_seg_readers; i++) {
            SegReader *seg_reader = (SegReader*)VA_Fetch(seg_readers, i);
            total_docs += SegReader_Doc_Count(seg_reader);
            if (total_docs < Math_fibonacci(i + 5)) {
                threshold = i + 1;
            }
        }
        VA_Splice(seg_readers, threshold, num_seg_readers);
    }

    return seg_readers;
}
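/* Worked trace of the selection rule (hypothetical doc counts, and assuming
 * Math_fibonacci(5) = 5, Math_fibonacci(6) = 8, Math_fibonacci(7) = 13):
 * with segments of 1, 2, and 300 docs sorted ascending, the running totals
 * are 1 (< 5, threshold = 1), 3 (< 8, threshold = 2), and 303 (not < 13).
 * The splice then trims everything from index 2 onward, so only the two
 * small segments are returned as merge candidates.
 */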
Segment*
IxManager_make_new_segment(IndexManager *self, Snapshot *snapshot) {
    VArray *files = Snapshot_List(snapshot);
    u32_t i, max;
    i32_t highest_seg_num = 0;
    CharBuf *seg_name = CB_new(20);
    Segment *segment;

    /* Find highest seg num. */
    for (i = 0, max = VA_Get_Size(files); i < max; i++) {
        CharBuf *file = (CharBuf*)VA_Fetch(files, i);
        if (CB_Starts_With_Str(file, "seg_", 4)) {
            i32_t seg_num = IxFileNames_extract_gen(file);
            if (seg_num > highest_seg_num) { highest_seg_num = seg_num; }
        }
    }

    /* Create segment with num one greater than current max. */
    S_cat_seg_name(seg_name, highest_seg_num + 1);
    segment = Seg_new(seg_name, self->folder);
    DECREF(seg_name);
    DECREF(files);

    return segment;
}
static uint8_t*
S_find_endpost(SortExternal *self, SortExternalIVARS *ivars) {
    uint8_t *endpost = NULL;
    const size_t width = ivars->width;

    for (uint32_t i = 0, max = VA_Get_Size(ivars->runs); i < max; i++) {
        // Get a run and retrieve the last item in its cache.
        SortExternal *const run = (SortExternal*)VA_Fetch(ivars->runs, i);
        SortExternalIVARS *const run_ivars = SortEx_IVARS(run);
        const uint32_t tick = run_ivars->cache_max - 1;
        if (tick >= run_ivars->cache_cap || run_ivars->cache_max < 1) {
            THROW(ERR, "Invalid SortExternal cache access: %u32 %u32 %u32",
                  tick, run_ivars->cache_max, run_ivars->cache_cap);
        }
        else {
            // Cache item with the highest sort value currently held in
            // memory by the run.
            uint8_t *candidate = run_ivars->cache + tick * width;

            // If it's the first run, item is automatically the new endpost.
            if (i == 0) {
                endpost = candidate;
            }
            // If it's less than the current endpost, it's the new endpost.
            else if (SortEx_Compare(self, candidate, endpost) < 0) {
                endpost = candidate;
            }
        }
    }

    return endpost;
}
PolyReader*
PolyReader_init(PolyReader *self, Schema *schema, Folder *folder,
                Snapshot *snapshot, IndexManager *manager,
                VArray *sub_readers) {
    PolyReaderIVARS *const ivars = PolyReader_IVARS(self);
    ivars->doc_max   = 0;
    ivars->del_count = 0;

    if (sub_readers) {
        uint32_t num_segs = VA_Get_Size(sub_readers);
        VArray *segments = VA_new(num_segs);
        for (uint32_t i = 0; i < num_segs; i++) {
            SegReader *seg_reader
                = (SegReader*)CERTIFY(VA_Fetch(sub_readers, i), SEGREADER);
            VA_Push(segments, INCREF(SegReader_Get_Segment(seg_reader)));
        }
        IxReader_init((IndexReader*)self, schema, folder, snapshot, segments,
                      -1, manager);
        DECREF(segments);
        S_init_sub_readers(self, sub_readers);
    }
    else {
        IxReader_init((IndexReader*)self, schema, folder, snapshot, NULL, -1,
                      manager);
        ivars->sub_readers = VA_new(0);
        ivars->offsets = I32Arr_new_steal(NULL, 0);
    }

    return self;
}
PhraseCompiler*
PhraseCompiler_init(PhraseCompiler *self, PhraseQuery *parent,
                    Searchable *searchable, float boost) {
    Schema     *schema = Searchable_Get_Schema(searchable);
    Similarity *sim    = Schema_Fetch_Sim(schema, parent->field);
    VArray     *terms  = parent->terms;
    u32_t i, max;

    /* Try harder to find a Similarity if necessary. */
    if (!sim) { sim = Schema_Get_Similarity(schema); }

    /* Init. */
    Compiler_init((Compiler*)self, (Query*)parent, searchable, sim, boost);

    /* Store IDF for the phrase. */
    self->idf = 0;
    for (i = 0, max = VA_Get_Size(terms); i < max; i++) {
        Obj *term = VA_Fetch(terms, i);
        self->idf += Sim_IDF(sim, searchable, parent->field, term);
    }

    /* Calculate raw weight. */
    self->raw_weight = self->idf * self->boost;

    /* Make final preparations. */
    PhraseCompiler_Normalize(self);

    return self;
}
// Create all the spans needed by HeatMap_Flatten_Spans, based on the source
// offsets and lengths... but leave the scores at 0.
static VArray*
S_flattened_but_empty_spans(VArray *spans) {
    const uint32_t num_spans = VA_Get_Size(spans);
    int32_t *bounds = (int32_t*)MALLOCATE((num_spans * 2) * sizeof(int32_t));

    // Assemble a list of all unique start/end boundaries.
    for (uint32_t i = 0; i < num_spans; i++) {
        Span *span = (Span*)VA_Fetch(spans, i);
        bounds[i]             = span->offset;
        bounds[i + num_spans] = span->offset + span->length;
    }
    Sort_quicksort(bounds, num_spans * 2, sizeof(uint32_t), S_compare_i32,
                   NULL);
    uint32_t num_bounds = 0;
    int32_t  last = I32_MAX;
    for (uint32_t i = 0; i < num_spans * 2; i++) {
        if (bounds[i] != last) {
            bounds[num_bounds++] = bounds[i];
            last = bounds[i];
        }
    }

    // Create one Span for each zone between two bounds.
    VArray *flattened = VA_new(num_bounds - 1);
    for (uint32_t i = 0; i < num_bounds - 1; i++) {
        int32_t start  = bounds[i];
        int32_t length = bounds[i + 1] - start;
        VA_Push(flattened, (Obj*)Span_new(start, length, 0.0f));
    }

    FREEMEM(bounds);
    return flattened;
}
void
Indexer_prepare_commit(Indexer *self) {
    VArray *seg_readers = PolyReader_Get_Seg_Readers(self->polyreader);
    uint32_t num_seg_readers = VA_Get_Size(seg_readers);
    bool_t merge_happened = false;

    if (!self->write_lock || self->prepared) {
        THROW(ERR, "Can't call Prepare_Commit() more than once");
    }

    // Merge existing index data.
    if (num_seg_readers) {
        merge_happened = S_maybe_merge(self, seg_readers);
    }

    // Add a new segment and write a new snapshot file if...
    if (Seg_Get_Count(self->segment)             // Docs/segs added.
        || merge_happened                        // Some segs merged.
        || !Snapshot_Num_Entries(self->snapshot) // Initializing index.
        || DelWriter_Updated(self->del_writer)
       ) {
        Folder   *folder   = self->folder;
        Schema   *schema   = self->schema;
        Snapshot *snapshot = self->snapshot;
        CharBuf  *old_schema_name = S_find_schema_file(snapshot);
        uint64_t  schema_gen = old_schema_name
                               ? IxFileNames_extract_gen(old_schema_name) + 1
                               : 1;
        char      base36[StrHelp_MAX_BASE36_BYTES];
        CharBuf  *new_schema_name;

        StrHelp_to_base36(schema_gen, &base36);
        new_schema_name = CB_newf("schema_%s.json", base36);

        // Finish the segment, write schema file.
        SegWriter_Finish(self->seg_writer);
        Schema_Write(schema, folder, new_schema_name);
        if (old_schema_name) {
            Snapshot_Delete_Entry(snapshot, old_schema_name);
        }
        Snapshot_Add_Entry(snapshot, new_schema_name);
        DECREF(new_schema_name);

        // Write temporary snapshot file.
        DECREF(self->snapfile);
        self->snapfile = IxManager_Make_Snapshot_Filename(self->manager);
        CB_Cat_Trusted_Str(self->snapfile, ".temp", 5);
        Folder_Delete(folder, self->snapfile);
        Snapshot_Write_File(snapshot, folder, self->snapfile);

        self->needs_commit = true;
    }

    // Close reader, so that we can delete its files if appropriate.
    PolyReader_Close(self->polyreader);

    self->prepared = true;
}