uint32_t
PolyReader_sub_tick(I32Array *offsets, int32_t doc_id) {
    // Locate the segment whose doc id range contains doc_id, via binary
    // search over the segment start offsets.
    int32_t num_segs = I32Arr_Get_Size(offsets);
    if (num_segs == 0) { return 0; }

    int32_t lower = -1;
    int32_t upper = num_segs;
    while (upper - lower > 1) {
        int32_t probe = lower + ((upper - lower) / 2);
        if (doc_id <= I32Arr_Get(offsets, probe)) {
            upper = probe;
        }
        else {
            lower = probe;
        }
    }
    if (upper == num_segs) { upper -= 1; }

    // Back up while doc_id still falls at or before this segment's start
    // (e.g. when successive segments share a start offset).
    while (upper > 0 && doc_id <= I32Arr_Get(offsets, upper)) {
        upper -= 1;
    }

    return upper;
}
// Divide a sorted list of doc ids into per-segment BitVectors, then wrap
// the resulting BitVecMatchers in a SeriesMatcher.
static SeriesMatcher*
S_make_series_matcher(I32Array *doc_ids, I32Array *offsets, int32_t doc_max) {
    int32_t total_ids  = I32Arr_Get_Size(doc_ids);
    int32_t total_segs = I32Arr_Get_Size(offsets);
    VArray *matchers   = VA_new(total_segs);
    int32_t consumed   = 0;  // how many doc_ids have been assigned so far

    for (int32_t seg = 0; seg < total_segs; seg++) {
        int32_t seg_start = I32Arr_Get(offsets, seg);
        int32_t seg_end   = seg == total_segs - 1
                            ? doc_max + 1
                            : I32Arr_Get(offsets, seg + 1);
        BitVector *bits = BitVec_new(seg_end - seg_start);

        // Consume every doc id that belongs to this segment.
        while (consumed < total_ids) {
            int32_t doc_id = I32Arr_Get(doc_ids, consumed);
            if (doc_id > seg_end) { break; }
            consumed++;
            BitVec_Set(bits, doc_id - seg_start);
        }

        VA_Push(matchers, (Obj*)BitVecMatcher_new(bits));
        DECREF(bits);
    }

    SeriesMatcher *series = SeriesMatcher_new(matchers, offsets);
    DECREF(matchers);
    return series;
}
uint32_t
IxManager_Choose_Sparse_IMP(IndexManager *self, I32Array *doc_counts) {
    UNUSED_VAR(self);
    const uint32_t num_candidates = (uint32_t)I32Arr_Get_Size(doc_counts);
    uint32_t threshold  = 0;
    int32_t  total_docs = 0;

    // Find sparsely populated segments: extend the threshold while the
    // cumulative doc count stays below a Fibonacci-derived cutoff.
    for (uint32_t i = 0; i < num_candidates; i++) {
        uint32_t num_segs_when_done = num_candidates - threshold + 1;
        total_docs += I32Arr_Get(doc_counts, i);
        if (total_docs < (int32_t)S_fibonacci(num_segs_when_done + 5)) {
            threshold = i + 1;
        }
    }

    // If recycling, try not to get stuck merging the same big segment
    // over and over on small commits.
    if (threshold == 1 && num_candidates > 2) {
        int32_t first_count  = I32Arr_Get(doc_counts, 0);
        int32_t second_count = I32Arr_Get(doc_counts, 1);
        // Try to merge 2 segments worth of stuff, so long as the next
        // segment is less than double the size.
        if (second_count / 2 < first_count) {
            threshold = 2;
        }
    }

    return threshold;
}
Vector*
TermCompiler_Highlight_Spans_IMP(TermCompiler *self, Searcher *searcher,
                                 DocVector *doc_vec, String *field) {
    TermCompilerIVARS *const ivars = TermCompiler_IVARS(self);
    TermQueryIVARS *const parent_ivars
        = TermQuery_IVARS((TermQuery*)ivars->parent);
    Vector *spans = Vec_new(0);
    UNUSED_VAR(searcher);

    // Bail out with an empty result unless the field matches.
    if (!Str_Equals(parent_ivars->field, (Obj*)field)) {
        return spans;
    }

    // Bail out unless the doc supplies a term vector for our term.
    TermVector *term_vector
        = DocVec_Term_Vector(doc_vec, field, (String*)parent_ivars->term);
    if (!term_vector) {
        return spans;
    }

    // Add one Span per occurrence, covering [start, end) offsets.
    I32Array *starts = TV_Get_Start_Offsets(term_vector);
    I32Array *ends   = TV_Get_End_Offsets(term_vector);
    for (size_t tick = 0, limit = I32Arr_Get_Size(starts);
         tick < limit; tick++) {
        int32_t span_start = I32Arr_Get(starts, tick);
        int32_t span_len   = I32Arr_Get(ends, tick) - span_start;
        Vec_Push(spans, (Obj*)Span_new(span_start, span_len,
                                       TermCompiler_Get_Weight(self)));
    }

    DECREF(term_vector);
    return spans;
}
// Verify that BitVec_To_Array reproduces a set of unique sorted values.
static void
test_To_Array(TestBatchRunner *runner) {
    uint64_t *source_ints = TestUtils_random_u64s(NULL, 20, 0, 200);
    BitVector *bit_vec = BitVec_new(0);
    I32Array *array;
    unsigned num_unique = 0;

    // Unique the random ints in place, keeping one representative per
    // run of duplicates.  Index 19 has no successor to compare against,
    // so the final run's representative must be appended explicitly --
    // the previous version silently dropped it.
    qsort(source_ints, 20, sizeof(uint64_t), S_compare_u64s);
    for (unsigned i = 0; i < 19; i++) {
        if (source_ints[i] != source_ints[i + 1]) {
            source_ints[num_unique] = source_ints[i];
            num_unique++;
        }
    }
    source_ints[num_unique] = source_ints[19];
    num_unique++;

    // Set bits.
    for (unsigned i = 0; i < num_unique; i++) {
        BitVec_Set(bit_vec, (size_t)source_ints[i]);
    }

    // Create the array and compare it to the source.
    array = BitVec_To_Array(bit_vec);
    unsigned i;
    for (i = 0; i < num_unique; i++) {
        if (I32Arr_Get(array, (size_t)i) != (int32_t)source_ints[i]) {
            break;
        }
    }
    TEST_UINT_EQ(runner, i, num_unique, "To_Array (%u == %u)",
                 i, num_unique);

    DECREF(array);
    DECREF(bit_vec);
    FREEMEM(source_ints);
}
// Verify that BitVec_To_Array reproduces a set of unique sorted values.
static void
test_To_Array(TestBatch *batch) {
    uint64_t *source_ints = TestUtils_random_u64s(NULL, 20, 0, 200);
    BitVector *bit_vec = BitVec_new(0);
    I32Array *array;
    long num_unique = 0;
    long i;

    // Unique the random ints in place, keeping one representative per
    // run of duplicates.  Index 19 has no successor to compare against,
    // so the final run's representative must be appended explicitly --
    // the previous version silently dropped it.
    Sort_quicksort(source_ints, 20, sizeof(uint64_t), S_compare_u64s, NULL);
    for (i = 0; i < 19; i++) {
        if (source_ints[i] != source_ints[i + 1]) {
            source_ints[num_unique] = source_ints[i];
            num_unique++;
        }
    }
    source_ints[num_unique] = source_ints[19];
    num_unique++;

    // Set bits.
    for (i = 0; i < num_unique; i++) {
        BitVec_Set(bit_vec, (uint32_t)source_ints[i]);
    }

    // Create the array and compare it to the source.
    array = BitVec_To_Array(bit_vec);
    for (i = 0; i < num_unique; i++) {
        if (I32Arr_Get(array, i) != (int32_t)source_ints[i]) {
            break;
        }
    }
    TEST_INT_EQ(batch, i, num_unique, "To_Array (%ld == %ld)",
                i, num_unique);

    DECREF(array);
    DECREF(bit_vec);
    FREEMEM(source_ints);
}
PolyPostingList*
PolyPList_init(PolyPostingList *self, const CharBuf *field,
               VArray *readers, I32Array *starts) {
    const u32_t num_readers = VA_Get_Size(readers);
    u32_t tick;

    /* Init. */
    self->tick    = 0;
    self->current = NULL;

    /* Assign. */
    self->field = CB_Clone(field);

    /* Gather a sub-posting-list from each reader, assigning each its
     * doc id offset; readers without a posting list are skipped. */
    self->sub_plists = VA_new(num_readers);
    for (tick = 0; tick < num_readers; tick++) {
        PostingsReader *const post_reader = (PostingsReader*)ASSERT_IS_A(
            VA_Fetch(readers, tick), POSTINGSREADER);
        i32_t doc_base = I32Arr_Get(starts, tick);
        SegPostingList *sub_plist
            = (SegPostingList*)PostReader_Posting_List(post_reader, field,
                                                       NULL);
        if (sub_plist == NULL) { continue; }
        ASSERT_IS_A(sub_plist, SEGPOSTINGLIST);
        SegPList_Set_Doc_Base(sub_plist, doc_base);
        VA_Push(self->sub_plists, (Obj*)sub_plist);
    }
    self->num_subs = VA_Get_Size(self->sub_plists);

    return self;
}
// Adjust current doc id. We create our own doc_count rather than rely on // SegReader's number because the DeletionsWriter and the SegReader are // probably out of sync. static void S_adjust_doc_id(SegWriter *self, SegReader *reader, I32Array *doc_map) { SegWriterIVARS *const ivars = SegWriter_IVARS(self); int32_t doc_count = SegReader_Doc_Max(reader); for (size_t i = 1, max = I32Arr_Get_Size(doc_map); i < max; i++) { if (I32Arr_Get(doc_map, i) == 0) { doc_count--; } } Seg_Increment_Count(ivars->segment, doc_count); }
DocVector*
PolySearcher_Fetch_Doc_Vec_IMP(PolySearcher *self, int32_t doc_id) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    // Route the request to the sub-searcher that owns this doc id.
    uint32_t  tick     = PolyReader_sub_tick(ivars->starts, doc_id);
    Searcher *searcher = (Searcher*)VA_Fetch(ivars->searchers, tick);
    if (!searcher) { THROW(ERR, "Invalid doc id: %i32", doc_id); }
    int32_t doc_base = I32Arr_Get(ivars->starts, tick);
    return Searcher_Fetch_Doc_Vec(searcher, doc_id - doc_base);
}
int32_t
MockMatcher_next(MockMatcher* self) {
    MockMatcherIVARS *const ivars = MockMatcher_IVARS(self);
    ivars->tick++;
    if (ivars->tick >= (int32_t)ivars->size) {
        // Exhausted: park on the last slot and signal completion.
        ivars->tick--;
        return 0;
    }
    return I32Arr_Get(ivars->doc_ids, ivars->tick);
}
DocVector*
PolyHLReader_fetch_doc_vec(PolyHighlightReader *self, int32_t doc_id) {
    // Route the request to the sub-reader that owns this doc id.
    uint32_t seg_tick = PolyReader_sub_tick(self->offsets, doc_id);
    HighlightReader *sub_reader
        = (HighlightReader*)VA_Fetch(self->readers, seg_tick);
    if (!sub_reader) { THROW(ERR, "Invalid doc_id: %i32", doc_id); }
    int32_t doc_base = I32Arr_Get(self->offsets, seg_tick);
    return HLReader_Fetch_Doc_Vec(sub_reader, doc_id - doc_base);
}
DocVector*
PolyHLReader_Fetch_Doc_Vec_IMP(PolyHighlightReader *self, int32_t doc_id) {
    PolyHighlightReaderIVARS *const ivars = PolyHLReader_IVARS(self);
    // Route the request to the sub-reader that owns this doc id.
    uint32_t seg_tick = PolyReader_sub_tick(ivars->offsets, doc_id);
    HighlightReader *sub_reader
        = (HighlightReader*)Vec_Fetch(ivars->readers, seg_tick);
    if (!sub_reader) { THROW(ERR, "Invalid doc_id: %i32", doc_id); }
    int32_t doc_base = I32Arr_Get(ivars->offsets, seg_tick);
    return HLReader_Fetch_Doc_Vec(sub_reader, doc_id - doc_base);
}
HitDoc*
PolySearcher_Fetch_Doc_IMP(PolySearcher *self, int32_t doc_id) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    // Route the request to the sub-searcher that owns this doc id.
    uint32_t  tick     = PolyReader_sub_tick(ivars->starts, doc_id);
    Searcher *searcher = (Searcher*)VA_Fetch(ivars->searchers, tick);
    if (!searcher) { THROW(ERR, "Invalid doc id: %i32", doc_id); }
    int32_t doc_base = I32Arr_Get(ivars->starts, tick);
    // Fetch in segment space, then restore the global doc id.
    HitDoc *hit_doc = Searcher_Fetch_Doc(searcher, doc_id - doc_base);
    HitDoc_Set_Doc_ID(hit_doc, doc_id);
    return hit_doc;
}
// Mark a single doc (addressed by its global doc id) as deleted in the
// bit vector of the segment that owns it, flagging the segment as
// updated if the bit wasn't already set.
void
DefDelWriter_Delete_By_Doc_ID_IMP(DefaultDeletionsWriter *self,
                                  int32_t doc_id) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    uint32_t sub_tick = PolyReader_sub_tick(ivars->seg_starts, doc_id);
    BitVector *bit_vec = (BitVector*)VA_Fetch(ivars->bit_vecs, sub_tick);
    // I32Arr_Get returns int32_t; keep the subtraction in signed
    // arithmetic instead of routing it through an unsigned temporary.
    int32_t offset = I32Arr_Get(ivars->seg_starts, sub_tick);
    int32_t seg_doc_id = doc_id - offset;
    if (!BitVec_Get(bit_vec, seg_doc_id)) {
        ivars->updated[sub_tick] = true;
        BitVec_Set(bit_vec, seg_doc_id);
    }
}
void
PolySearcher_Collect_IMP(PolySearcher *self, Query *query,
                         Collector *collector) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    VArray *const searchers = ivars->searchers;
    I32Array *starts = ivars->starts;
    uint32_t num_searchers = VA_Get_Size(searchers);

    // Delegate to each sub-searcher, wrapping the collector so that
    // collected doc ids are shifted by the sub-searcher's start offset.
    for (uint32_t tick = 0; tick < num_searchers; tick++) {
        int32_t doc_base = I32Arr_Get(starts, tick);
        Searcher *child = (Searcher*)VA_Fetch(searchers, tick);
        OffsetCollector *wrapper = OffsetColl_new(collector, doc_base);
        Searcher_Collect(child, query, (Collector*)wrapper);
        DECREF(wrapper);
    }
}
// Reload the buffer with items consumed from the current run's sorted
// doc ids, stopping at the memory threshold.  Returns the number of
// items loaded; 0 once the sort cache is exhausted.
uint32_t
SortFieldWriter_Refill_IMP(SortFieldWriter *self) {
    SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
    if (!ivars->sort_cache) { return 0; }

    // Sanity check, then reset the buffer and prepare to start loading items.
    uint32_t buf_count = SortFieldWriter_Buffer_Count(self);
    if (buf_count) {
        THROW(ERR, "Refill called but buffer contains %u32 items",
              buf_count);
    }
    SortFieldWriter_Clear_Buffer(self);
    Counter_Reset(ivars->counter);
    S_lazy_init_sorted_ids(self);

    const int32_t null_ord = ivars->null_ord;
    I32Array *const doc_map = ivars->doc_map;
    SortCache *const sort_cache = ivars->sort_cache;

    // Consume doc ids until the run is exhausted or the memory
    // threshold is crossed.
    uint32_t count = 0;
    while (ivars->run_tick <= ivars->run_max
           && Counter_Get_Value(ivars->counter) < ivars->mem_thresh
          ) {
        int32_t raw_doc_id = ivars->sorted_ids[ivars->run_tick];
        int32_t ord = SortCache_Ordinal(sort_cache, raw_doc_id);
        // Skip docs whose field value is null.
        if (ord != null_ord) {
            // Remap around deletions; a remapped id of 0 means the doc
            // was zapped, so it is skipped.
            int32_t remapped = doc_map
                               ? I32Arr_Get(doc_map, raw_doc_id)
                               : raw_doc_id;
            if (remapped) {
                Obj *val = SortCache_Value(sort_cache, ord);
                SortFieldWriter_Add(self, remapped, val);
                count++;
                DECREF(val);
            }
        }
        ivars->run_tick++;
    }

    // Run exhausted: release the sort cache and scratch id array.
    if (ivars->run_tick > ivars->run_max) {
        DECREF(ivars->sort_cache);
        ivars->sort_cache = NULL;
        FREEMEM(ivars->sorted_ids);
        ivars->sorted_ids = NULL;
    }

    return count;
}
HitDoc*
PolyDocReader_Fetch_Doc_IMP(PolyDocReader *self, int32_t doc_id) {
    PolyDocReaderIVARS *const ivars = PolyDocReader_IVARS(self);
    // Route the request to the sub-reader that owns this doc id.
    uint32_t seg_tick = PolyReader_sub_tick(ivars->offsets, doc_id);
    int32_t  doc_base = I32Arr_Get(ivars->offsets, seg_tick);
    DocReader *sub_reader
        = (DocReader*)Vec_Fetch(ivars->readers, seg_tick);
    HitDoc *hit_doc = NULL;
    if (sub_reader == NULL) {
        THROW(ERR, "Invalid doc_id: %i32", doc_id);
    }
    else {
        // Fetch in segment space, then restore the global doc id.
        hit_doc = DocReader_Fetch_Doc(sub_reader, doc_id - doc_base);
        HitDoc_Set_Doc_ID(hit_doc, doc_id);
    }
    return hit_doc;
}
Obj*
PolyDocReader_fetch(PolyDocReader *self, i32_t doc_id, float score,
                    i32_t offset) {
    // Route the request to the sub-reader that owns this doc id.
    u32_t seg_tick = PolyReader_sub_tick(self->offsets, doc_id);
    i32_t doc_base = I32Arr_Get(self->offsets, seg_tick);
    DocReader *sub_reader
        = (DocReader*)VA_Fetch(self->readers, seg_tick);
    Obj *hit = NULL;
    if (!sub_reader) {
        THROW("Invalid doc_id: %i32", doc_id);
    }
    else {
        // Translate the global doc id into segment space, and fold the
        // segment's base into the caller-supplied offset.
        hit = DocReader_Fetch(sub_reader, doc_id - doc_base, score,
                              offset + doc_base);
    }
    return hit;
}
// Collect the top num_wanted hits across all sub-searchers: run the
// query against each one, shift the returned doc ids by the
// sub-searcher's start offset, and merge everything through a shared
// HitQueue.
TopDocs*
PolySearcher_Top_Docs_IMP(PolySearcher *self, Query *query,
                          uint32_t num_wanted, SortSpec *sort_spec) {
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    Schema   *schema    = PolySearcher_Get_Schema(self);
    VArray   *searchers = ivars->searchers;
    I32Array *starts    = ivars->starts;
    HitQueue *hit_q     = sort_spec
                          ? HitQ_new(schema, sort_spec, num_wanted)
                          : HitQ_new(NULL, NULL, num_wanted);
    uint32_t  total_hits = 0;
    // Reuse the query as a compiler if it already is one; otherwise
    // compile it once and share the compiler across sub-searches.
    Compiler *compiler = Query_Is_A(query, COMPILER)
                         ? ((Compiler*)INCREF(query))
                         : Query_Make_Compiler(query, (Searcher*)self,
                                               Query_Get_Boost(query),
                                               false);

    for (uint32_t i = 0, max = VA_Get_Size(searchers); i < max; i++) {
        Searcher *searcher = (Searcher*)VA_Fetch(searchers, i);
        int32_t base = I32Arr_Get(starts, i);
        TopDocs *top_docs = Searcher_Top_Docs(searcher, (Query*)compiler,
                                              num_wanted, sort_spec);
        VArray *sub_match_docs = TopDocs_Get_Match_Docs(top_docs);
        total_hits += TopDocs_Get_Total_Hits(top_docs);

        // Rewrite segment-space doc ids into global doc ids.
        S_modify_doc_ids(sub_match_docs, base);
        for (uint32_t j = 0, jmax = VA_Get_Size(sub_match_docs);
             j < jmax; j++) {
            MatchDoc *match_doc = (MatchDoc*)VA_Fetch(sub_match_docs, j);
            // Queue keeps only the best num_wanted; once an insert is
            // rejected, the rest of this (sorted) batch can't win either.
            if (!HitQ_Insert(hit_q, INCREF(match_doc))) { break; }
        }

        DECREF(top_docs);
    }

    VArray  *match_docs = HitQ_Pop_All(hit_q);
    TopDocs *retval     = TopDocs_new(match_docs, total_hits);

    DECREF(match_docs);
    DECREF(compiler);
    DECREF(hit_q);
    return retval;
}
// Advance to the first doc id >= target, hopping across sub-matchers as
// their offset ranges are exhausted.  Returns the new doc id, or 0 when
// the series is exhausted.
i32_t
SeriesMatcher_advance(SeriesMatcher *self, i32_t target) {
    if (target >= self->next_offset) {
        // Target lies beyond the current sub-matcher's range.
        /* Proceed to next matcher or bail. */
        if (self->tick < self->num_matchers) {
            while (1) {
                // The last matcher's range is capped by I32_MAX rather
                // than a following offset.
                u32_t next_offset
                    = self->tick + 1 == self->num_matchers
                      ? I32_MAX
                      : I32Arr_Get(self->offsets, self->tick + 1);
                self->current_matcher
                    = (Matcher*)VA_Fetch(self->matchers, self->tick);
                self->current_offset = self->next_offset;
                self->next_offset = next_offset;
                // Park doc_id just before the new range so the recursive
                // advance below starts fresh.
                self->doc_id = next_offset - 1;
                self->tick++;
                // Skip over NULL sub-matchers, but stop once the supply
                // runs out.
                if (self->current_matcher != NULL
                    || self->tick >= self->num_matchers
                   ) {
                    break;
                }
            }
            return SeriesMatcher_advance(self, target); /* Recurse. */
        }
        else {
            /* We're done. */
            self->doc_id = 0;
            return 0;
        }
    }
    else {
        // Target falls inside the current sub-matcher: advance it in
        // segment space and translate any hit back to series space.
        i32_t target_minus_offset = target - self->current_offset;
        i32_t found
            = Matcher_Advance(self->current_matcher, target_minus_offset);
        if (found) {
            self->doc_id = found + self->current_offset;
            return self->doc_id;
        }
        else {
            // Current sub-matcher exhausted; move to the next range.
            return SeriesMatcher_advance(self, self->next_offset); /* Recurse. */
        }
    }
}
void
HLWriter_Add_Segment_IMP(HighlightWriter *self, SegReader *reader,
                         I32Array *doc_map) {
    HighlightWriterIVARS *const ivars = HLWriter_IVARS(self);
    int32_t doc_max = SegReader_Doc_Max(reader);

    // Bail if the supplied segment is empty.
    if (doc_max == 0) { return; }

    DefaultHighlightReader *hl_reader
        = (DefaultHighlightReader*)CERTIFY(
              SegReader_Obtain(reader, Class_Get_Name(HIGHLIGHTREADER)),
              DEFAULTHIGHLIGHTREADER);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = ivars->ix_out;
    ByteBuf   *bb      = BB_new(0);

    for (int32_t orig = 1; orig <= doc_max; orig++) {
        // Skip deleted docs.
        if (doc_map && !I32Arr_Get(doc_map, orig)) { continue; }

        // Write file pointer.
        OutStream_Write_I64(ix_out, OutStream_Tell(dat_out));

        // Copy the raw record.
        DefHLReader_Read_Record(hl_reader, orig, bb);
        OutStream_Write_Bytes(dat_out, BB_Get_Buf(bb), BB_Get_Size(bb));
        BB_Set_Size(bb, 0);
    }

    DECREF(bb);
}
// Copy all surviving document records from the supplied segment into
// this writer's output streams, skipping docs zapped by doc_map.
void
DocWriter_Add_Segment_IMP(DocWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    DocWriterIVARS *const ivars = DocWriter_IVARS(self);
    int32_t doc_max = SegReader_Doc_Max(reader);

    if (doc_max == 0) {
        // Bail if the supplied segment is empty.
        return;
    }
    else {
        OutStream *const dat_out = S_lazy_init(self);
        OutStream *const ix_out  = ivars->ix_out;
        ByteBuf *const buffer    = BB_new(0);
        DefaultDocReader *const doc_reader
            = (DefaultDocReader*)CERTIFY(
                  SegReader_Obtain(reader, VTable_Get_Name(DOCREADER)),
                  DEFAULTDOCREADER);

        for (int32_t i = 1, max = SegReader_Doc_Max(reader); i <= max; i++) {
            // Skip deleted docs.  A NULL doc_map means "no deletions",
            // matching HLWriter_Add_Segment's treatment; previously a
            // NULL doc_map was dereferenced unconditionally.
            if (doc_map == NULL || I32Arr_Get(doc_map, i)) {
                int64_t start = OutStream_Tell(dat_out);

                // Copy record over.
                DefDocReader_Read_Record(doc_reader, buffer, i);
                char  *buf  = BB_Get_Buf(buffer);
                size_t size = BB_Get_Size(buffer);
                OutStream_Write_Bytes(dat_out, buf, size);

                // Write file pointer.
                OutStream_Write_I64(ix_out, start);
            }
        }

        DECREF(buffer);
    }
}
void
HLWriter_add_segment(HighlightWriter *self, SegReader *reader,
                     I32Array *doc_map) {
    i32_t doc_max = SegReader_Doc_Max(reader);

    /* Bail if the supplied segment is empty. */
    if (doc_max == 0) { return; }

    DefaultHighlightReader *hl_reader = (DefaultHighlightReader*)
        ASSERT_IS_A(SegReader_Obtain(reader, HIGHLIGHTREADER.name),
                    DEFAULTHIGHLIGHTREADER);
    OutStream *dat_out = S_lazy_init(self);
    OutStream *ix_out  = self->ix_out;
    ByteBuf   *bb      = BB_new(0);
    i32_t      orig;

    for (orig = 1; orig <= doc_max; orig++) {
        /* Skip deleted docs. */
        if (doc_map && !I32Arr_Get(doc_map, orig)) { continue; }

        /* Write file pointer. */
        OutStream_Write_U64(ix_out, OutStream_Tell(dat_out));

        /* Copy the raw record. */
        DefHLReader_Read_Record(hl_reader, orig, bb);
        OutStream_Write_Bytes(dat_out, bb->ptr, bb->size);
        bb->size = 0;
    }

    DECREF(bb);
}
// Build a match list and a segment layout from the supplied parameters,
// then verify that a SeriesMatcher over them yields exactly the
// expected doc ids.
static void
S_do_test_matrix(TestBatch *batch, int32_t doc_max, int32_t first_doc_id,
                 int32_t doc_inc, int32_t offset_inc) {
    I32Array *doc_ids
        = S_generate_match_list(first_doc_id, doc_max, doc_inc);
    I32Array *offsets = S_generate_match_list(0, doc_max, offset_inc);
    SeriesMatcher *series_matcher
        = S_make_series_matcher(doc_ids, offsets, doc_max);
    uint32_t num_in_agreement = 0;

    for (int32_t got = SeriesMatcher_Next(series_matcher);
         got != 0;
         got = SeriesMatcher_Next(series_matcher)
        ) {
        if (got != I32Arr_Get(doc_ids, num_in_agreement)) { break; }
        num_in_agreement++;
    }

    TEST_INT_EQ(batch, num_in_agreement, I32Arr_Get_Size(doc_ids),
                "doc_max=%d first_doc_id=%d doc_inc=%d offset_inc=%d",
                doc_max, first_doc_id, doc_inc, offset_inc);

    DECREF(doc_ids);
    DECREF(offsets);
    DECREF(series_matcher);
}
int32_t
MockMatcher_get_doc_id(MockMatcher* self) {
    // Report the doc id at the matcher's current position.
    MockMatcherIVARS *const ivars = MockMatcher_IVARS(self);
    return I32Arr_Get(ivars->doc_ids, ivars->tick);
}
// Carry forward deletions that were committed against segments which
// were merged away while the background merge ran.  Returns true if any
// deletions had to be merged, false if there was nothing to do.
static bool
S_merge_updated_deletions(BackgroundMerger *self) {
    BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self);
    Hash *updated_deletions = NULL;

    // Open a fresh view of the index as it stands right now.
    PolyReader *new_polyreader
        = PolyReader_open((Obj*)ivars->folder, NULL, NULL);
    Vector *new_seg_readers
        = PolyReader_Get_Seg_Readers(new_polyreader);
    Vector *old_seg_readers
        = PolyReader_Get_Seg_Readers(ivars->polyreader);

    // Index the current segment readers by segment name.
    Hash *new_segs = Hash_new(Vec_Get_Size(new_seg_readers));
    for (uint32_t i = 0, max = Vec_Get_Size(new_seg_readers);
         i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(new_seg_readers, i);
        String *seg_name = SegReader_Get_Seg_Name(seg_reader);
        Hash_Store(new_segs, seg_name, INCREF(seg_reader));
    }

    for (uint32_t i = 0, max = Vec_Get_Size(old_seg_readers);
         i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(old_seg_readers, i);
        String *seg_name = SegReader_Get_Seg_Name(seg_reader);
        // If this segment was merged away...
        if (Hash_Fetch(ivars->doc_maps, seg_name)) {
            SegReader *new_seg_reader
                = (SegReader*)CERTIFY(
                      Hash_Fetch(new_segs, seg_name), SEGREADER);
            int32_t old_del_count = SegReader_Del_Count(seg_reader);
            int32_t new_del_count = SegReader_Del_Count(new_seg_reader);
            // ... were any new deletions applied against it?
            if (old_del_count != new_del_count) {
                DeletionsReader *del_reader
                    = (DeletionsReader*)SegReader_Obtain(
                          new_seg_reader,
                          Class_Get_Name(DELETIONSREADER));
                // Lazily create the collection of per-segment iterators.
                if (!updated_deletions) {
                    updated_deletions = Hash_new(max);
                }
                Hash_Store(updated_deletions, seg_name,
                           (Obj*)DelReader_Iterator(del_reader));
            }
        }
    }

    DECREF(new_polyreader);
    DECREF(new_segs);

    if (!updated_deletions) {
        // No merged-away segment picked up new deletions: nothing to do.
        return false;
    }
    else {
        // Open the index as it existed when the merge began.
        PolyReader *merge_polyreader
            = PolyReader_open((Obj*)ivars->folder, ivars->snapshot, NULL);
        Vector *merge_seg_readers
            = PolyReader_Get_Seg_Readers(merge_polyreader);
        Snapshot *latest_snapshot
            = Snapshot_Read_File(Snapshot_new(), ivars->folder, NULL);
        int64_t new_seg_num
            = IxManager_Highest_Seg_Num(ivars->manager, latest_snapshot)
              + 1;
        Segment *new_segment = Seg_new(new_seg_num);
        SegWriter *seg_writer
            = SegWriter_new(ivars->schema, ivars->snapshot, new_segment,
                            merge_polyreader);
        DeletionsWriter *del_writer = SegWriter_Get_Del_Writer(seg_writer);
        int64_t merge_seg_num = Seg_Get_Number(ivars->segment);
        uint32_t seg_tick = INT32_MAX;  // sentinel: "not found yet"
        int32_t offset = INT32_MAX;     // sentinel: "not found yet"

        SegWriter_Prep_Seg_Dir(seg_writer);

        // Locate the merged segment and its doc id offset.
        for (uint32_t i = 0, max = Vec_Get_Size(merge_seg_readers);
             i < max; i++) {
            SegReader *seg_reader
                = (SegReader*)Vec_Fetch(merge_seg_readers, i);
            if (SegReader_Get_Seg_Num(seg_reader) == merge_seg_num) {
                I32Array *offsets = PolyReader_Offsets(merge_polyreader);
                seg_tick = i;
                offset = I32Arr_Get(offsets, seg_tick);
                DECREF(offsets);
            }
        }
        if (offset == INT32_MAX) { THROW(ERR, "Failed sanity check"); }

        HashIterator *iter = HashIter_new(updated_deletions);
        while (HashIter_Next(iter)) {
            String *seg_name = HashIter_Get_Key(iter);
            Matcher *deletions = (Matcher*)HashIter_Get_Value(iter);
            I32Array *doc_map
                = (I32Array*)CERTIFY(
                      Hash_Fetch(ivars->doc_maps, seg_name), I32ARRAY);
            int32_t del;
            while (0 != (del = Matcher_Next(deletions))) {
                // Find the slot where the deleted doc resides in the
                // rewritten segment.  If the doc was already deleted
                // when we were merging, do nothing.
                int32_t remapped = I32Arr_Get(doc_map, del);
                if (remapped) {
                    // It's a new deletion, so carry it forward and zap
                    // it in the rewritten segment.
                    DelWriter_Delete_By_Doc_ID(del_writer,
                                               remapped + offset);
                }
            }
        }
        DECREF(iter);

        // Finish the segment and clean up.
        DelWriter_Finish(del_writer);
        SegWriter_Finish(seg_writer);
        DECREF(seg_writer);
        DECREF(new_segment);
        DECREF(latest_snapshot);
        DECREF(merge_polyreader);
        DECREF(updated_deletions);
    }

    return true;
}
// Refill the pool's buffer with RawPostings read back via the lexicon
// and posting list, remapping doc ids around deletions, until either
// the run is exhausted or the memory threshold is hit.  Returns the
// number of items recovered.
uint32_t
PostPool_Refill_IMP(PostingPool *self) {
    PostingPoolIVARS *const ivars = PostPool_IVARS(self);
    Lexicon *const lexicon = ivars->lexicon;
    PostingList *const plist = ivars->plist;
    I32Array *const doc_map = ivars->doc_map;
    const uint32_t mem_thresh = ivars->mem_thresh;
    const int32_t doc_base = ivars->doc_base;
    uint32_t num_elems = 0; // number of items recovered
    String *term_text = NULL;

    // No lexicon means there's nothing to refill from.
    if (ivars->lexicon == NULL) { return 0; }
    else { term_text = (String*)Lex_Get_Term(lexicon); }

    // Make sure buffer is empty.
    if (ivars->buf_max - ivars->buf_tick > 0) {
        THROW(ERR, "Refill called but buffer contains %u32 items",
              ivars->buf_max - ivars->buf_tick);
    }
    ivars->buf_max = 0;
    ivars->buf_tick = 0;

    // Ditch old MemoryPool and get another.
    DECREF(ivars->mem_pool);
    ivars->mem_pool = MemPool_new(0);
    MemoryPool *const mem_pool = ivars->mem_pool;
    MemoryPoolIVARS *const mem_pool_ivars = MemPool_IVARS(mem_pool);

    while (1) {
        if (ivars->post_count == 0) {
            // Read a term.
            if (Lex_Next(lexicon)) {
                ivars->post_count = Lex_Doc_Freq(lexicon);
                term_text = (String*)Lex_Get_Term(lexicon);
                if (term_text && !Obj_Is_A((Obj*)term_text, STRING)) {
                    THROW(ERR, "Only String terms are supported for now");
                }
                // Reset the delta-decoding baseline for the new term.
                Posting *posting = PList_Get_Posting(plist);
                Post_Set_Doc_ID(posting, doc_base);
                ivars->last_doc_id = doc_base;
            }
            // Bail if we've read everything in this run.
            else {
                break;
            }
        }

        // Bail if we've hit the ceiling for this run's buffer.
        if (mem_pool_ivars->consumed >= mem_thresh && num_elems > 0) {
            break;
        }

        // Read a posting from the input stream.
        RawPosting *rawpost
            = PList_Read_Raw(plist, ivars->last_doc_id, term_text,
                             mem_pool);
        RawPostingIVARS *const rawpost_ivars = RawPost_IVARS(rawpost);
        ivars->last_doc_id = rawpost_ivars->doc_id;
        ivars->post_count--;

        // Skip deletions: a doc that remaps to 0 was zapped; otherwise
        // rewrite the posting's doc id to its new slot.
        if (doc_map != NULL) {
            const int32_t remapped
                = I32Arr_Get(doc_map, rawpost_ivars->doc_id - doc_base);
            if (!remapped) {
                continue;
            }
            rawpost_ivars->doc_id = remapped;
        }

        // Add to the run's buffer, growing it as needed.
        if (num_elems >= ivars->buf_cap) {
            size_t new_cap = Memory_oversize(num_elems + 1, sizeof(Obj*));
            PostPool_Grow_Buffer(self, new_cap);
        }
        ivars->buffer[num_elems] = (Obj*)rawpost;
        num_elems++;
    }

    // Reset the buffer array position and length; remember file pos.
    ivars->buf_max = num_elems;
    ivars->buf_tick = 0;

    return num_elems;
}
// Produce a Span for every location where the complete phrase occurs in
// the field, by intersecting the per-term position sets recorded in the
// doc's term vectors.
VArray*
PhraseCompiler_highlight_spans(PhraseCompiler *self, Searcher *searcher,
                               DocVector *doc_vec, const CharBuf *field) {
    PhraseQuery *const parent = (PhraseQuery*)self->parent;
    VArray *const terms = parent->terms;
    VArray *const spans = VA_new(0);
    VArray *term_vectors;
    BitVector *posit_vec;
    BitVector *other_posit_vec;
    uint32_t i;
    const uint32_t num_terms = VA_Get_Size(terms);
    uint32_t num_tvs;
    UNUSED_VAR(searcher);

    // Bail if no terms or field doesn't match.
    if (!num_terms) {
        return spans;
    }
    if (!CB_Equals(field, (Obj*)parent->field)) {
        return spans;
    }

    term_vectors = VA_new(num_terms);
    posit_vec = BitVec_new(0);
    other_posit_vec = BitVec_new(0);
    for (i = 0; i < num_terms; i++) {
        Obj *term = VA_Fetch(terms, i);
        TermVector *term_vector
            = DocVec_Term_Vector(doc_vec, field, (CharBuf*)term);

        // Bail if any term is missing.
        if (!term_vector) break;

        VA_Push(term_vectors, (Obj*)term_vector);

        if (i == 0) {
            // Set initial positions from first term.
            uint32_t j;
            I32Array *positions = TV_Get_Positions(term_vector);
            for (j = I32Arr_Get_Size(positions); j > 0; j--) {
                BitVec_Set(posit_vec, I32Arr_Get(positions, j - 1));
            }
        }
        else {
            // Filter positions using logical "and".  Each term's
            // positions are shifted back by its slot in the phrase, so
            // aligned occurrences land on the same bit.
            uint32_t j;
            I32Array *positions = TV_Get_Positions(term_vector);

            BitVec_Clear_All(other_posit_vec);
            for (j = I32Arr_Get_Size(positions); j > 0; j--) {
                int32_t pos = I32Arr_Get(positions, j - 1) - i;
                if (pos >= 0) {
                    BitVec_Set(other_posit_vec, pos);
                }
            }
            BitVec_And(posit_vec, other_posit_vec);
        }
    }

    // Proceed only if all terms are present.
    num_tvs = VA_Get_Size(term_vectors);
    if (num_tvs == num_terms) {
        TermVector *first_tv = (TermVector*)VA_Fetch(term_vectors, 0);
        TermVector *last_tv
            = (TermVector*)VA_Fetch(term_vectors, num_tvs - 1);
        I32Array *tv_start_positions = TV_Get_Positions(first_tv);
        I32Array *tv_end_positions = TV_Get_Positions(last_tv);
        I32Array *tv_start_offsets = TV_Get_Start_Offsets(first_tv);
        I32Array *tv_end_offsets = TV_Get_End_Offsets(last_tv);
        uint32_t terms_max = num_terms - 1;
        I32Array *valid_posits = BitVec_To_Array(posit_vec);
        uint32_t num_valid_posits = I32Arr_Get_Size(valid_posits);
        uint32_t j = 0;  // cursor into the last term's positions
        uint32_t posit_tick;
        float weight = PhraseCompiler_Get_Weight(self);
        i = 0;           // cursor into the first term's positions

        // Add only those starts/ends that belong to a valid position.
        // Both cursors only ever move forward because valid positions
        // arrive in ascending order.
        for (posit_tick = 0; posit_tick < num_valid_posits;
             posit_tick++) {
            int32_t valid_start_posit
                = I32Arr_Get(valid_posits, posit_tick);
            int32_t valid_end_posit = valid_start_posit + terms_max;
            int32_t start_offset = 0, end_offset = 0;
            uint32_t max;

            // Locate the first term's start offset at this position.
            for (max = I32Arr_Get_Size(tv_start_positions);
                 i < max; i++) {
                if (I32Arr_Get(tv_start_positions, i)
                    == valid_start_posit) {
                    start_offset = I32Arr_Get(tv_start_offsets, i);
                    break;
                }
            }

            // Locate the last term's end offset at the phrase's end.
            for (max = I32Arr_Get_Size(tv_end_positions); j < max; j++) {
                if (I32Arr_Get(tv_end_positions, j) == valid_end_posit) {
                    end_offset = I32Arr_Get(tv_end_offsets, j);
                    break;
                }
            }

            VA_Push(spans,
                    (Obj*)Span_new(start_offset,
                                   end_offset - start_offset, weight)
                   );

            i++, j++;
        }

        DECREF(valid_posits);
    }

    DECREF(other_posit_vec);
    DECREF(posit_vec);
    DECREF(term_vectors);
    return spans;
}