/* Rebuild a TermVector from `instream`.
 *
 * The stream is consumed in this order: field name, term text, a count read
 * with InStream_Read_C32, and then for each occurrence three further
 * InStream_Read_C32 values (position, start offset, end offset).
 *
 * If `self` is NULL, a fresh object is obtained from VTable_Make_Obj before
 * TV_init is invoked.  Returns whatever TV_init returns.
 */
TermVector* TV_deserialize(TermVector *self, InStream *instream) {
    CharBuf  *field_name = (CharBuf*)CB_deserialize(NULL, instream);
    CharBuf  *term_text  = (CharBuf*)CB_deserialize(NULL, instream);
    u32_t     count      = InStream_Read_C32(instream);
    i32_t    *pos_buf    = MALLOCATE(count, i32_t);
    i32_t    *start_buf  = MALLOCATE(count, i32_t);
    i32_t    *end_buf    = MALLOCATE(count, i32_t);
    I32Array *pos_arr;
    I32Array *start_arr;
    I32Array *end_arr;
    u32_t     tick;

    /* Pull in one (position, start, end) triple per occurrence. */
    for (tick = 0; tick < count; tick++) {
        pos_buf[tick]   = InStream_Read_C32(instream);
        start_buf[tick] = InStream_Read_C32(instream);
        end_buf[tick]   = InStream_Read_C32(instream);
    }

    /* Hand the raw buffers over to I32Array wrappers. */
    pos_arr   = I32Arr_new_steal(pos_buf, count);
    start_arr = I32Arr_new_steal(start_buf, count);
    end_arr   = I32Arr_new_steal(end_buf, count);

    if (!self) {
        self = (TermVector*)VTable_Make_Obj(&TERMVECTOR);
    }
    self = TV_init(self, field_name, term_text, pos_arr, start_arr, end_arr);

    /* Release the local references taken above. */
    DECREF(pos_arr);
    DECREF(start_arr);
    DECREF(end_arr);
    DECREF(term_text);
    DECREF(field_name);

    return self;
}
// Populate `self` from `instream`.
//
// Reads two strings via Freezer_read_string (field, then term text), a count
// via InStream_Read_C64, and then three InStream_Read_C32 values per
// occurrence: position, start offset, end offset.  The decoded data is passed
// to TV_init and `self` is returned.
TermVector* TV_Deserialize_IMP(TermVector *self, InStream *instream) {
    String *field_name = Freezer_read_string(instream);
    String *term_text  = Freezer_read_string(instream);
    size_t  count      = InStream_Read_C64(instream);

    // One buffer per kind of positional datum.
    int32_t *pos_buf   = (int32_t*)MALLOCATE(count * sizeof(int32_t));
    int32_t *start_buf = (int32_t*)MALLOCATE(count * sizeof(int32_t));
    int32_t *end_buf   = (int32_t*)MALLOCATE(count * sizeof(int32_t));

    // Walk the three buffers in lockstep with write cursors.
    int32_t *pos_cursor   = pos_buf;
    int32_t *start_cursor = start_buf;
    int32_t *end_cursor   = end_buf;
    for (size_t remaining = count; remaining > 0; remaining--) {
        *pos_cursor++   = InStream_Read_C32(instream);
        *start_cursor++ = InStream_Read_C32(instream);
        *end_cursor++   = InStream_Read_C32(instream);
    }

    I32Array *pos_arr   = I32Arr_new_steal(pos_buf, count);
    I32Array *start_arr = I32Arr_new_steal(start_buf, count);
    I32Array *end_arr   = I32Arr_new_steal(end_buf, count);

    TV_init(self, field_name, term_text, pos_arr, start_arr, end_arr);

    // Drop the references held by this function.
    DECREF(pos_arr);
    DECREF(start_arr);
    DECREF(end_arr);
    DECREF(term_text);
    DECREF(field_name);

    return self;
}
// Initialize a PolySearcher which aggregates the Searchers in `searchers`.
//
// Every element of `searchers` must pass CERTIFY(..., SEARCHER) and must have
// a Schema whose Class is identical to that of `schema`; otherwise an error
// is thrown.  On success, ivars->starts holds for each child the sum of
// Searcher_Doc_Max() over the children that precede it, and ivars->doc_max
// holds the sum over all children.
//
// Returns `self`.
PolySearcher* PolySearcher_init(PolySearcher *self, Schema *schema, VArray *searchers) {
    const uint32_t num_searchers = VA_Get_Size(searchers);

    Searcher_init((Searcher*)self, schema);
    PolySearcherIVARS *const ivars = PolySearcher_IVARS(self);
    ivars->searchers = (VArray*)INCREF(searchers);
    ivars->starts = NULL; // Safe cleanup.

    // Validate every child before allocating the offsets buffer.  Both
    // CERTIFY and the schema check can throw; doing them first means no raw
    // buffer is outstanding (and therefore leaked) when that happens.
    for (uint32_t i = 0; i < num_searchers; i++) {
        Searcher *searcher
            = (Searcher*)CERTIFY(VA_Fetch(searchers, i), SEARCHER);
        Schema *candidate       = Searcher_Get_Schema(searcher);
        Class  *orig_class      = Schema_Get_Class(schema);
        Class  *candidate_class = Schema_Get_Class(candidate);

        // Confirm that searchers all use the same schema.
        if (orig_class != candidate_class) {
            THROW(ERR, "Conflicting schemas: '%o', '%o'",
                  Schema_Get_Class_Name(schema),
                  Schema_Get_Class_Name(candidate));
        }
    }

    // Derive doc_max and relative start offsets.  The elements were
    // certified above, so a plain cast suffices here.
    int32_t *starts_array
        = (int32_t*)MALLOCATE(num_searchers * sizeof(int32_t));
    int32_t doc_max = 0;
    for (uint32_t i = 0; i < num_searchers; i++) {
        Searcher *searcher = (Searcher*)VA_Fetch(searchers, i);
        starts_array[i] = doc_max;
        doc_max += Searcher_Doc_Max(searcher);
    }

    ivars->doc_max = doc_max;
    // The I32Array takes over the buffer from here on.
    ivars->starts = I32Arr_new_steal(starts_array, num_searchers);

    return self;
}
/* Return a new one-element I32Array wrapping a buffer obtained from
 * CALLOCATE(1, i32_t) (presumably zero-filled, calloc-style).  `self` is not
 * consulted.
 */
I32Array* SegReader_offsets(SegReader *self) {
    i32_t *lone_offset = CALLOCATE(1, i32_t);
    I32Array *offsets;

    UNUSED_VAR(self);

    offsets = I32Arr_new_steal(lone_offset, 1);
    return offsets;
}
// Initialize a PolyReader.
//
// With `sub_readers` supplied, the Segment of each SegReader is collected
// into a temporary array that is passed to IxReader_init, after which
// S_init_sub_readers() installs the sub readers.  Without `sub_readers`,
// IxReader_init receives NULL for the segments and the reader is given an
// empty sub-reader array and an empty offsets array.
//
// Returns `self`.
PolyReader* PolyReader_init(PolyReader *self, Schema *schema, Folder *folder, Snapshot *snapshot, IndexManager *manager, VArray *sub_readers) {
    PolyReaderIVARS *const ivars = PolyReader_IVARS(self);
    ivars->doc_max   = 0;
    ivars->del_count = 0;

    // No sub readers: set up an empty reader and finish early.
    if (!sub_readers) {
        IxReader_init((IndexReader*)self, schema, folder, snapshot, NULL, -1,
                      manager);
        ivars->sub_readers = VA_new(0);
        ivars->offsets     = I32Arr_new_steal(NULL, 0);
        return self;
    }

    // Gather one Segment per SegReader.
    uint32_t seg_count = VA_Get_Size(sub_readers);
    VArray  *segs      = VA_new(seg_count);
    for (uint32_t tick = 0; tick < seg_count; tick++) {
        Obj       *elem   = VA_Fetch(sub_readers, tick);
        SegReader *reader = (SegReader*)CERTIFY(elem, SEGREADER);
        VA_Push(segs, INCREF(SegReader_Get_Segment(reader)));
    }

    IxReader_init((IndexReader*)self, schema, folder, snapshot, segs, -1,
                  manager);
    DECREF(segs);
    S_init_sub_readers(self, sub_readers);

    return self;
}
// Choose which SegReaders of `reader` should be recycled.
//
// Only SegReaders whose segment number is greater than `cutoff` are
// considered.  When `optimize` is true, all of them are returned.  Otherwise
// the candidates are sorted with S_compare_doc_count, the first N (N as
// reported by IxManager_Choose_Sparse on their doc counts) are selected, and
// any remaining candidate is also selected if its deletion count divided by
// its doc_max is at least 0.1.
//
// Returns a new Vector holding an added reference to each selected SegReader.
Vector* IxManager_Recycle_IMP(IndexManager *self, PolyReader *reader, DeletionsWriter *del_writer, int64_t cutoff, bool optimize) {
    Vector *all_readers = PolyReader_Get_Seg_Readers(reader);
    size_t  total       = Vec_Get_Size(all_readers);
    SegReader **candidates
        = (SegReader**)MALLOCATE(total * sizeof(SegReader*));
    size_t num_candidates = 0;

    // Keep only the readers past the cutoff.
    for (size_t tick = 0; tick < total; tick++) {
        SegReader *current = (SegReader*)Vec_Fetch(all_readers, tick);
        if (!(SegReader_Get_Seg_Num(current) > cutoff)) {
            continue;
        }
        candidates[num_candidates] = current;
        num_candidates++;
    }

    Vector *recyclables = Vec_new(num_candidates);

    if (optimize) {
        // Everything past the cutoff gets recycled.
        for (size_t tick = 0; tick < num_candidates; tick++) {
            Vec_Push(recyclables, INCREF(candidates[tick]));
        }
    }
    else {
        // Sort by ascending size in docs, choose sparsely populated segments.
        qsort(candidates, num_candidates, sizeof(SegReader*),
              S_compare_doc_count);
        int32_t *count_buf
            = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
        for (uint32_t tick = 0; tick < num_candidates; tick++) {
            count_buf[tick] = SegReader_Doc_Count(candidates[tick]);
        }
        I32Array *doc_counts = I32Arr_new_steal(count_buf, num_candidates);
        uint32_t  sparse_end = IxManager_Choose_Sparse(self, doc_counts);
        DECREF(doc_counts);

        // Move SegReaders to be recycled.
        for (uint32_t tick = 0; tick < sparse_end; tick++) {
            Vec_Store(recyclables, tick, INCREF(candidates[tick]));
        }

        // Find segments where at least 10% of all docs have been deleted.
        for (uint32_t tick = sparse_end; tick < num_candidates; tick++) {
            SegReader *current  = candidates[tick];
            String    *seg_name = SegReader_Get_Seg_Name(current);
            double     doc_max  = SegReader_Doc_Max(current);
            double     deleted  = DelWriter_Seg_Del_Count(del_writer, seg_name);
            if (deleted / doc_max >= 0.1) {
                Vec_Push(recyclables, INCREF(current));
            }
        }
    }

    FREEMEM(candidates);
    return recyclables;
}
// Decode the positional data in `tv_buf` into a new TermVector for
// `term_text` in `field`.
//
// An empty buffer yields a TermVector with zero positions.  Otherwise the
// buffer is read with NumUtil_decode_c32 as: a count, followed by that many
// (position, start offset, end offset) triples.  If decoding does not stop
// exactly at the end of the buffer, "Bad encoding of posdata" is thrown.
//
// Returns the new TermVector.
static TermVector* S_extract_tv_from_tv_buf(String *field, String *term_text, ByteBuf *tv_buf) {
    TermVector *retval      = NULL;
    const char *posdata     = BB_Get_Buf(tv_buf);
    const char *posdata_end = posdata + BB_Get_Size(tv_buf);
    int32_t    *positions   = NULL;
    int32_t    *starts      = NULL;
    int32_t    *ends        = NULL;
    uint32_t    num_pos     = 0;

    if (posdata != posdata_end) {
        num_pos   = NumUtil_decode_c32(&posdata);
        positions = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        starts    = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        ends      = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
    }

    // Expand C32s.
    for (uint32_t i = 0; i < num_pos; i++) {
        positions[i] = NumUtil_decode_c32(&posdata);
        starts[i]    = NumUtil_decode_c32(&posdata);
        ends[i]      = NumUtil_decode_c32(&posdata);
    }

    // Wrap the raw buffers before validating, so that the error path below
    // can release them through the same reference-counting route as the
    // success path instead of leaking them when THROW unwinds.
    I32Array *posits_map = I32Arr_new_steal(positions, num_pos);
    I32Array *starts_map = I32Arr_new_steal(starts, num_pos);
    I32Array *ends_map   = I32Arr_new_steal(ends, num_pos);

    if (posdata != posdata_end) {
        DECREF(posits_map);
        DECREF(starts_map);
        DECREF(ends_map);
        THROW(ERR, "Bad encoding of posdata");
    }
    else {
        retval = TV_new(field, term_text, posits_map, starts_map, ends_map);
        DECREF(posits_map);
        DECREF(starts_map);
        DECREF(ends_map);
    }

    return retval;
}
// Install `sub_readers` on `self` and rebuild the state derived from them.
//
// Replaces ivars->sub_readers and ivars->offsets, recomputes ivars->doc_max as
// the running sum of SegReader_Doc_Max(), and groups each SegReader's
// components by their key into per-key arrays indexed by sub-reader position.
// For each key, the first non-null component is asked (DataReader_Aggregator)
// for an aggregator, which is stored in ivars->components when one is
// returned.  Finally ivars->del_count is taken from the DeletionsReader
// component, or set to 0 if there is none.
static void S_init_sub_readers(PolyReader *self, VArray *sub_readers) {
    PolyReaderIVARS *const ivars = PolyReader_IVARS(self);
    uint32_t  seg_count   = VA_Get_Size(sub_readers);
    int32_t  *offsets_buf = (int32_t*)MALLOCATE(seg_count * sizeof(int32_t));
    Hash     *by_api      = Hash_new(0);

    // Swap in the new sub readers, discarding any previous state.
    DECREF(ivars->sub_readers);
    DECREF(ivars->offsets);
    ivars->sub_readers = (VArray*)INCREF(sub_readers);

    // Accumulate doc_max, subreader start offsets, and DataReaders.
    ivars->doc_max = 0;
    for (uint32_t tick = 0; tick < seg_count; tick++) {
        SegReader  *seg_reader = (SegReader*)VA_Fetch(sub_readers, tick);
        Hash       *components = SegReader_Get_Components(seg_reader);
        CharBuf    *key;
        DataReader *value;

        offsets_buf[tick] = ivars->doc_max;
        ivars->doc_max += SegReader_Doc_Max(seg_reader);

        Hash_Iterate(components);
        while (Hash_Next(components, (Obj**)&key, (Obj**)&value)) {
            // Fetch the per-key slot array, creating it on first sight.
            VArray *slots = (VArray*)Hash_Fetch(by_api, (Obj*)key);
            if (slots == NULL) {
                slots = VA_new(seg_count);
                Hash_Store(by_api, (Obj*)key, (Obj*)slots);
            }
            VA_Store(slots, tick, INCREF(value));
        }
    }
    ivars->offsets = I32Arr_new_steal(offsets_buf, seg_count);

    // Build one aggregator per key.
    CharBuf *api_name;
    VArray  *per_seg;
    Hash_Iterate(by_api);
    while (Hash_Next(by_api, (Obj**)&api_name, (Obj**)&per_seg)) {
        DataReader *exemplar
            = (DataReader*)CERTIFY(S_first_non_null(per_seg), DATAREADER);
        DataReader *combined
            = DataReader_Aggregator(exemplar, per_seg, ivars->offsets);
        if (combined) {
            CERTIFY(combined, DATAREADER);
            Hash_Store(ivars->components, (Obj*)api_name, (Obj*)combined);
        }
    }
    DECREF(by_api);

    // Cache the deletion count, if a DeletionsReader is present.
    DeletionsReader *del_reader = (DeletionsReader*)Hash_Fetch(
        ivars->components, (Obj*)VTable_Get_Name(DELETIONSREADER));
    if (del_reader) {
        ivars->del_count = DelReader_Del_Count(del_reader);
    }
    else {
        ivars->del_count = 0;
    }
}
// Build an I32Array containing first, first + doc_inc, first + 2*doc_inc, ...
// for every value strictly less than `max`.
//
// `doc_inc` must be positive; an error is thrown otherwise.  When
// `first >= max` the returned array is empty.
//
// The element count is computed with exact 64-bit integer arithmetic.  The
// previous single-precision float computation could miscount once the values
// involved no longer fit in a float's significand, and produced a negative
// count (and thus an absurd allocation size) for some first > max inputs.
static I32Array* S_generate_match_list(int32_t first, int32_t max, int32_t doc_inc) {
    // A zero step would divide by zero; a negative one would never reach max.
    if (doc_inc <= 0) {
        THROW(ERR, "doc_inc must be positive: %i32", doc_inc);
    }

    // Ceiling division of the span by the step, widened so that neither the
    // subtraction nor the rounding term can overflow.
    int64_t span   = (int64_t)max - (int64_t)first;
    int64_t needed = span > 0 ? (span + doc_inc - 1) / doc_inc : 0;
    if (needed > INT32_MAX) {
        THROW(ERR, "Too many doc ids: %i64", needed);
    }

    int32_t  count   = (int32_t)needed;
    int32_t *doc_ids = (int32_t*)MALLOCATE(count * sizeof(int32_t));

    // Every generated id is < max, so it fits in int32_t; the product is
    // formed in 64 bits because it can exceed INT32_MAX when first < 0.
    for (int32_t i = 0; i < count; i++) {
        doc_ids[i] = (int32_t)((int64_t)first + (int64_t)i * doc_inc);
    }

    return I32Arr_new_steal(doc_ids, count);
}
// Choose which SegReaders of `reader` should be recycled.
//
// Candidates are gathered from the PolyReader's SegReaders using
// S_check_cutoff with `cutoff`.  When `optimize` is set, the gathered array
// itself is returned.  Otherwise candidates are sorted with
// S_compare_doc_count; the first N (N as reported by IxManager_Choose_Sparse
// on their doc counts) are moved into the result, and each remaining one is
// moved into the result only if its deletion count divided by its doc_max is
// at least 0.1.
VArray* IxManager_recycle(IndexManager *self, PolyReader *reader, DeletionsWriter *del_writer, int64_t cutoff, bool_t optimize) {
    VArray *all_readers = PolyReader_Get_Seg_Readers(reader);
    VArray *candidates  = VA_Gather(all_readers, S_check_cutoff, &cutoff);
    VArray *recyclables = VA_new(VA_Get_Size(candidates));
    const uint32_t num_candidates = VA_Get_Size(candidates);

    // Optimizing: every candidate is recycled, so hand back the lot.
    if (optimize) {
        DECREF(recyclables);
        return candidates;
    }

    // Sort by ascending size in docs, choose sparsely populated segments.
    VA_Sort(candidates, S_compare_doc_count, NULL);
    int32_t *count_buf
        = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    for (uint32_t tick = 0; tick < num_candidates; tick++) {
        Obj       *elem    = VA_Fetch(candidates, tick);
        SegReader *current = (SegReader*)CERTIFY(elem, SEGREADER);
        count_buf[tick] = SegReader_Doc_Count(current);
    }
    I32Array *doc_counts = I32Arr_new_steal(count_buf, num_candidates);
    uint32_t  sparse_end = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Move SegReaders to be recycled.
    for (uint32_t tick = 0; tick < sparse_end; tick++) {
        Obj *moved = VA_Delete(candidates, tick);
        VA_Store(recyclables, tick, moved);
    }

    // Find segments where at least 10% of all docs have been deleted.
    for (uint32_t tick = sparse_end; tick < num_candidates; tick++) {
        SegReader *current  = (SegReader*)VA_Delete(candidates, tick);
        CharBuf   *seg_name = SegReader_Get_Seg_Name(current);
        double     doc_max  = SegReader_Doc_Max(current);
        double     deleted  = DelWriter_Seg_Del_Count(del_writer, seg_name);
        if (deleted / doc_max >= 0.1) {
            VA_Push(recyclables, (Obj*)current);
        }
        else {
            // Not selected: drop the reference VA_Delete handed over.
            DECREF(current);
        }
    }

    DECREF(candidates);
    return recyclables;
}
// Build a map from old doc ids (1..doc_max) to new doc ids.
//
// Slots whose index equals a doc id yielded by `deletions` are left as
// CALLOCATE returned them (0 for a deleted doc); every other slot receives
// `offset` plus a counter that starts at 1 and advances once per surviving
// doc.  Slot 0 is never written.  `deletions` may be NULL, in which case no
// doc is treated as deleted.
//
// Returns a new I32Array of doc_max + 1 elements.
I32Array* DelWriter_Generate_Doc_Map_IMP(DeletionsWriter *self, Matcher *deletions, int32_t doc_max, int32_t offset) {
    int32_t *map_buf = (int32_t*)CALLOCATE(doc_max + 1, sizeof(int32_t));
    int32_t  pending = INT32_MAX;
    if (deletions) {
        pending = Matcher_Next(deletions);
    }
    UNUSED_VAR(self);

    // 0 for a deleted doc, a new number otherwise
    int32_t survivor = 1;
    for (int32_t old_id = 1; old_id <= doc_max; old_id++) {
        if (old_id != pending) {
            map_buf[old_id] = offset + survivor;
            survivor++;
        }
        else {
            // Deleted: leave the slot untouched and advance to the next one.
            pending = Matcher_Next(deletions);
        }
    }

    return I32Arr_new_steal(map_buf, doc_max + 1);
}
// Return a new one-element I32Array wrapping a buffer obtained from
// CALLOCATE(1, sizeof(int32_t)) (presumably zero-filled, calloc-style).
// `self` is not consulted.
I32Array* SegReader_Offsets_IMP(SegReader *self) {
    int32_t *lone_offset = (int32_t*)CALLOCATE(1, sizeof(int32_t));
    UNUSED_VAR(self);

    I32Array *offsets = I32Arr_new_steal(lone_offset, 1);
    return offsets;
}