/* Build a SeriesMatcher from a flat list of doc ids.
 *
 * One BitVecMatcher is created per entry in `offsets`.  The segment at
 * index N takes every doc id up to and including the following offset
 * (or `doc_max + 1` for the final segment) and records it relative to its
 * own offset.  The doc ids are consumed front to back with a single
 * cursor, so each id lands in exactly one segment.
 *
 * The returned SeriesMatcher is constructed from the per-segment matchers
 * and `offsets`; the local reference to the matcher array is released
 * before returning.
 */
static SeriesMatcher*
S_make_series_matcher(I32Array *doc_ids, I32Array *offsets, int32_t doc_max) {
    const int32_t id_count     = I32Arr_Get_Size(doc_ids);
    const int32_t seg_count    = I32Arr_Get_Size(offsets);
    VArray       *seg_matchers = VA_new(seg_count);
    int32_t       cursor       = 0;

    for (int32_t seg = 0; seg < seg_count; seg++) {
        const int32_t base = I32Arr_Get(offsets, seg);
        int32_t limit;
        if (seg == seg_count - 1) {
            // Last segment: bounded by the overall doc_max.
            limit = doc_max + 1;
        }
        else {
            limit = I32Arr_Get(offsets, seg + 1);
        }

        BitVector *seg_bits = BitVec_new(limit - base);

        // Consume doc ids until one lies beyond this segment's limit.
        for (; cursor < id_count; cursor++) {
            const int32_t doc_id = I32Arr_Get(doc_ids, cursor);
            if (doc_id > limit) {
                break;
            }
            BitVec_Set(seg_bits, doc_id - base);
        }

        VA_Push(seg_matchers, (Obj*)BitVecMatcher_new(seg_bits));
        DECREF(seg_bits);
    }

    SeriesMatcher *result = SeriesMatcher_new(seg_matchers, offsets);
    DECREF(seg_matchers);
    return result;
}
/* Find the index in `offsets` of the last entry that is strictly less than
 * `doc_id`.
 *
 * Returns 0 when `offsets` is empty or when no entry is less than `doc_id`.
 * The bisection step assumes `offsets` is in ascending order (not checked
 * here -- confirm against callers).
 */
uint32_t
PolyReader_sub_tick(I32Array *offsets, int32_t doc_id) {
    const int32_t count = I32Arr_Get_Size(offsets);
    if (count == 0) {
        return 0;
    }

    // Bisect: `below` always indexes an offset < doc_id (or is -1), `tick`
    // always indexes an offset >= doc_id (or is `count`).
    int32_t below = -1;
    int32_t tick  = count;
    while (tick - below > 1) {
        const int32_t probe = below + ((tick - below) / 2);
        if (doc_id <= I32Arr_Get(offsets, probe)) {
            tick = probe;
        }
        else {
            below = probe;
        }
    }

    // Clamp to the last valid index if every offset was less than doc_id.
    if (tick == count) {
        tick--;
    }

    // Step down past any entry that is still >= doc_id, stopping at 0.
    for (; tick > 0; tick--) {
        if (doc_id > I32Arr_Get(offsets, tick)) {
            break;
        }
    }

    return (uint32_t)tick;
}
/* Decide how many of the leading entries in `doc_counts` to treat as
 * sparse.
 *
 * Walks the counts in order, keeping a running document total.  Whenever
 * that total is below S_fibonacci(segments-remaining + 5), the cut-off is
 * advanced to include the current entry.  Returns the final cut-off, i.e.
 * the number of leading entries selected.
 *
 * `self` is unused.
 */
uint32_t
IxManager_Choose_Sparse_IMP(IndexManager *self, I32Array *doc_counts) {
    UNUSED_VAR(self);
    const uint32_t num_candidates = (uint32_t)I32Arr_Get_Size(doc_counts);
    uint32_t cutoff        = 0;
    int32_t  running_total = 0;

    // Accumulate doc counts and extend the cut-off while the total stays
    // under the Fibonacci-derived ceiling.
    for (uint32_t tick = 0; tick < num_candidates; tick++) {
        const uint32_t segs_after_merge = num_candidates - cutoff + 1;
        running_total += I32Arr_Get(doc_counts, tick);
        const int32_t ceiling = (int32_t)S_fibonacci(segs_after_merge + 5);
        if (running_total < ceiling) {
            cutoff = tick + 1;
        }
    }

    // Only the "exactly one segment chosen, more than two available" case
    // gets the extra adjustment below.
    if (cutoff != 1 || num_candidates <= 2) {
        return cutoff;
    }

    // Avoid re-merging the same single segment on every small commit: take
    // the second segment along with it, provided the second is less than
    // double the size of the first.
    const int32_t first_count  = I32Arr_Get(doc_counts, 0);
    const int32_t second_count = I32Arr_Get(doc_counts, 1);
    if (second_count / 2 < first_count) {
        cutoff = 2;
    }

    return cutoff;
}
/* Produce one Span per occurrence of the parent query's term in `field`.
 *
 * Returns a new, possibly empty, Vector.  It is empty when `field` is not
 * the parent query's field or when `doc_vec` has no term vector for the
 * term.  Each Span starts at the occurrence's start offset, has length
 * (end offset - start offset), and carries this compiler's weight.
 *
 * `searcher` is unused.
 */
Vector*
TermCompiler_Highlight_Spans_IMP(TermCompiler *self, Searcher *searcher,
                                 DocVector *doc_vec, String *field) {
    TermCompilerIVARS *const ivars = TermCompiler_IVARS(self);
    TermQueryIVARS *const query_ivars
        = TermQuery_IVARS((TermQuery*)ivars->parent);
    Vector *spans = Vec_new(0);
    UNUSED_VAR(searcher);

    if (Str_Equals(query_ivars->field, (Obj*)field)) {
        TermVector *tv
            = DocVec_Term_Vector(doc_vec, field, (String*)query_ivars->term);

        if (tv) {
            I32Array *start_offsets = TV_Get_Start_Offsets(tv);
            I32Array *end_offsets   = TV_Get_End_Offsets(tv);
            const size_t num_hits   = I32Arr_Get_Size(start_offsets);

            // One span per (start, end) pair.
            for (size_t tick = 0; tick < num_hits; tick++) {
                const int32_t begin = I32Arr_Get(start_offsets, tick);
                const int32_t span_len
                    = I32Arr_Get(end_offsets, tick) - begin;
                Span *span = Span_new(begin, span_len,
                                      TermCompiler_Get_Weight(self));
                Vec_Push(spans, (Obj*)span);
            }

            DECREF(tv);
        }
    }

    return spans;
}
// Adjust current doc id. We create our own doc_count rather than rely on // SegReader's number because the DeletionsWriter and the SegReader are // probably out of sync. static void S_adjust_doc_id(SegWriter *self, SegReader *reader, I32Array *doc_map) { SegWriterIVARS *const ivars = SegWriter_IVARS(self); int32_t doc_count = SegReader_Doc_Max(reader); for (size_t i = 1, max = I32Arr_Get_Size(doc_map); i < max; i++) { if (I32Arr_Get(doc_map, i) == 0) { doc_count--; } } Seg_Increment_Count(ivars->segment, doc_count); }
/* Initialize a MockMatcher.
 *
 * Runs the Matcher base initializer, takes a new reference to both
 * `doc_ids` and `scores`, records the number of doc ids, and sets the
 * tick to -1.  Returns `self`.
 */
MockMatcher*
MockMatcher_init(MockMatcher *self, I32Array *doc_ids, ByteBuf *scores) {
    Matcher_init((Matcher*)self);
    MockMatcherIVARS *const ivars = MockMatcher_IVARS(self);

    // Retain the inputs.
    ivars->doc_ids = (I32Array*)INCREF(doc_ids);
    ivars->scores  = (ByteBuf*)INCREF(scores);

    // Iteration bookkeeping.
    ivars->size = I32Arr_Get_Size(doc_ids);
    ivars->tick = -1;

    return self;
}
/* Initialize a TermVector.
 *
 * Stores clones of `field` and `text`, takes a new reference to each of
 * the three arrays, and records the size of `positions` as num_pos.
 * Throws if `start_offsets` or `end_offsets` differs in size from
 * `positions`.  Returns `self`.
 */
TermVector*
TV_init(TermVector *self, const CharBuf *field, const CharBuf *text,
        I32Array *positions, I32Array *start_offsets,
        I32Array *end_offsets) {
    /* Copy the strings. */
    self->field = CB_Clone(field);
    self->text  = CB_Clone(text);

    /* Retain the arrays. */
    self->positions     = (I32Array*)INCREF(positions);
    self->start_offsets = (I32Array*)INCREF(start_offsets);
    self->end_offsets   = (I32Array*)INCREF(end_offsets);
    self->num_pos       = I32Arr_Get_Size(positions);

    /* All three arrays must be the same length. */
    if (!(I32Arr_Get_Size(start_offsets) == self->num_pos
          && I32Arr_Get_Size(end_offsets) == self->num_pos)
       ) {
        THROW("Unbalanced arrays: %u32 %u32 %u32", self->num_pos,
              I32Arr_Get_Size(start_offsets),
              I32Arr_Get_Size(end_offsets));
    }

    return self;
}
/* Initialize a TermVector.
 *
 * Stores clones of `field` and `text`, takes a new reference to each of
 * the three arrays, and records the size of `positions` as num_pos.
 * Throws ERR if `start_offsets` or `end_offsets` differs in size from
 * `positions`.  Returns `self`.
 */
TermVector*
TV_init(TermVector *self, String *field, String *text,
        I32Array *positions, I32Array *start_offsets,
        I32Array *end_offsets) {
    TermVectorIVARS *const ivars = TV_IVARS(self);

    // Copy the strings.
    ivars->field = Str_Clone(field);
    ivars->text  = Str_Clone(text);

    // Retain the arrays.
    ivars->positions     = (I32Array*)INCREF(positions);
    ivars->start_offsets = (I32Array*)INCREF(start_offsets);
    ivars->end_offsets   = (I32Array*)INCREF(end_offsets);
    ivars->num_pos       = I32Arr_Get_Size(positions);

    // All three arrays must be the same length.
    if (!(I32Arr_Get_Size(start_offsets) == ivars->num_pos
          && I32Arr_Get_Size(end_offsets) == ivars->num_pos)
       ) {
        THROW(ERR, "Unbalanced arrays: %u64 %u64 %u64",
              (uint64_t)ivars->num_pos,
              (uint64_t)I32Arr_Get_Size(start_offsets),
              (uint64_t)I32Arr_Get_Size(end_offsets));
    }

    return self;
}
/* Initialize a SeriesMatcher.
 *
 * Runs the Matcher base initializer, takes a new reference to `matchers`
 * and `offsets`, sets num_matchers to the size of `offsets`, and zeroes
 * the iteration state.  Returns `self`.
 */
SeriesMatcher*
SeriesMatcher_init(SeriesMatcher *self, VArray *matchers,
                   I32Array *offsets) {
    Matcher_init((Matcher*)self);

    /* Retain inputs and derive the matcher count from the offsets. */
    self->matchers     = (VArray*)INCREF(matchers);
    self->offsets      = (I32Array*)INCREF(offsets);
    self->num_matchers = (i32_t)I32Arr_Get_Size(offsets);

    /* No sub-matcher is selected yet. */
    self->current_matcher = NULL;
    self->tick            = 0;
    self->doc_id          = 0;
    self->current_offset  = 0;
    self->next_offset     = 0;

    return self;
}
/* Run a single SeriesMatcher test case.
 *
 * Generates a doc id list (from `first_doc_id`, stepping by `doc_inc`) and
 * an offsets list (from 0, stepping by `offset_inc`), both bounded by
 * `doc_max`, builds a SeriesMatcher over them, and iterates it.  The test
 * passes when the matcher yields the generated doc ids in order and then
 * returns 0 -- i.e. when the number of agreeing results equals the number
 * of generated doc ids.
 */
static void
S_do_test_matrix(TestBatch *batch, int32_t doc_max, int32_t first_doc_id,
                 int32_t doc_inc, int32_t offset_inc) {
    I32Array *doc_ids
        = S_generate_match_list(first_doc_id, doc_max, doc_inc);
    I32Array *offsets
        = S_generate_match_list(0, doc_max, offset_inc);
    SeriesMatcher *matcher
        = S_make_series_matcher(doc_ids, offsets, doc_max);
    uint32_t num_matched = 0;

    // Stop at exhaustion (0) or at the first disagreement.
    for (;;) {
        const int32_t got = SeriesMatcher_Next(matcher);
        if (got == 0) {
            break;
        }
        if (got != I32Arr_Get(doc_ids, num_matched)) {
            break;
        }
        num_matched++;
    }

    TEST_INT_EQ(batch, num_matched, I32Arr_Get_Size(doc_ids),
                "doc_max=%d first_doc_id=%d doc_inc=%d offset_inc=%d",
                doc_max, first_doc_id, doc_inc, offset_inc);

    DECREF(doc_ids);
    DECREF(offsets);
    DECREF(matcher);
}
// Produce one Span per occurrence of the parent query's phrase in `field`.
//
// Returns a new, possibly empty, VArray.  It is empty when the query has
// no terms, when `field` is not the query's field, or when `doc_vec` lacks
// a term vector for any of the terms.
//
// Phrase starts are found by intersecting position sets: the first term's
// positions seed a bit vector, and each later term's positions, shifted
// back by that term's index within the phrase, are ANDed into it.  Each
// surviving position becomes a Span running from the first term's start
// offset to the last term's end offset, weighted with this compiler's
// weight.
//
// `searcher` is unused.
VArray*
PhraseCompiler_highlight_spans(PhraseCompiler *self, Searcher *searcher,
                               DocVector *doc_vec, const CharBuf *field) {
    PhraseQuery *const query     = (PhraseQuery*)self->parent;
    VArray      *const terms     = query->terms;
    VArray      *const spans     = VA_new(0);
    const uint32_t     num_terms = VA_Get_Size(terms);
    VArray    *tvs;
    BitVector *phrase_starts;
    BitVector *shifted;
    uint32_t   term_tick;
    UNUSED_VAR(searcher);

    // Nothing to highlight without terms or with a non-matching field.
    if (!num_terms || !CB_Equals(field, (Obj*)query->field)) {
        return spans;
    }

    tvs           = VA_new(num_terms);
    phrase_starts = BitVec_new(0);
    shifted       = BitVec_new(0);

    for (term_tick = 0; term_tick < num_terms; term_tick++) {
        Obj        *term = VA_Fetch(terms, term_tick);
        TermVector *tv
            = DocVec_Term_Vector(doc_vec, field, (CharBuf*)term);
        I32Array   *positions;
        uint32_t    remaining;

        // A missing term means the phrase cannot match; stop collecting.
        if (!tv) {
            break;
        }
        VA_Push(tvs, (Obj*)tv);

        positions = TV_Get_Positions(tv);
        remaining = I32Arr_Get_Size(positions);

        if (term_tick == 0) {
            // Seed the candidate set with the first term's positions.
            while (remaining-- > 0) {
                BitVec_Set(phrase_starts, I32Arr_Get(positions, remaining));
            }
        }
        else {
            // Shift this term's positions back to where the phrase would
            // have started, then keep only candidates present in both.
            BitVec_Clear_All(shifted);
            while (remaining-- > 0) {
                int32_t pos = I32Arr_Get(positions, remaining) - term_tick;
                if (pos >= 0) {
                    BitVec_Set(shifted, pos);
                }
            }
            BitVec_And(phrase_starts, shifted);
        }
    }

    // Build spans only if a term vector was found for every term.
    if (VA_Get_Size(tvs) == num_terms) {
        const uint32_t last_term = num_terms - 1;
        TermVector *first_tv = (TermVector*)VA_Fetch(tvs, 0);
        TermVector *last_tv  = (TermVector*)VA_Fetch(tvs, last_term);
        I32Array *first_positions = TV_Get_Positions(first_tv);
        I32Array *last_positions  = TV_Get_Positions(last_tv);
        I32Array *first_starts    = TV_Get_Start_Offsets(first_tv);
        I32Array *last_ends       = TV_Get_End_Offsets(last_tv);
        I32Array *matches         = BitVec_To_Array(phrase_starts);
        const uint32_t num_matches = I32Arr_Get_Size(matches);
        const float    weight      = PhraseCompiler_Get_Weight(self);
        const uint32_t num_first   = I32Arr_Get_Size(first_positions);
        const uint32_t num_last    = I32Arr_Get_Size(last_positions);
        // These two cursors are never reset: each match resumes scanning
        // where the previous one stopped.
        uint32_t first_cursor = 0;
        uint32_t last_cursor  = 0;
        uint32_t match_tick;

        for (match_tick = 0; match_tick < num_matches; match_tick++) {
            int32_t phrase_begin = I32Arr_Get(matches, match_tick);
            int32_t phrase_end   = phrase_begin + last_term;
            int32_t start_offset = 0;
            int32_t end_offset   = 0;

            // Locate the first term's occurrence at the phrase start.
            for (; first_cursor < num_first; first_cursor++) {
                if (I32Arr_Get(first_positions, first_cursor)
                    == phrase_begin
                   ) {
                    start_offset = I32Arr_Get(first_starts, first_cursor);
                    break;
                }
            }

            // Locate the last term's occurrence at the phrase end.
            for (; last_cursor < num_last; last_cursor++) {
                if (I32Arr_Get(last_positions, last_cursor) == phrase_end) {
                    end_offset = I32Arr_Get(last_ends, last_cursor);
                    break;
                }
            }

            VA_Push(spans, (Obj*)Span_new(start_offset,
                                          end_offset - start_offset,
                                          weight));

            // Move past the entries just consumed.
            first_cursor++;
            last_cursor++;
        }

        DECREF(matches);
    }

    DECREF(shifted);
    DECREF(phrase_starts);
    DECREF(tvs);
    return spans;
}