/* Choose which segments should be merged away.
 *
 * Candidates are the SegReaders from `reader` that S_check_cutoff accepts for
 * `cutoff`.  When `optimize` is true every candidate is returned as-is.
 * Otherwise the candidates are sorted with S_compare_doc_count,
 * IxManager_Choose_Sparse() decides how many from the front of that ordering
 * are recycled unconditionally, and each remaining candidate is recycled only
 * when at least 10% of its doc_max is reported deleted by `del_writer`.
 *
 * The returned VArray carries the reference this function held on it.
 */
VArray*
IxManager_recycle(IndexManager *self, PolyReader *reader,
                  DeletionsWriter *del_writer, int64_t cutoff,
                  bool_t optimize) {
    VArray *all_readers = PolyReader_Get_Seg_Readers(reader);
    VArray *candidates  = VA_Gather(all_readers, S_check_cutoff, &cutoff);
    VArray *recyclables = VA_new(VA_Get_Size(candidates));
    const uint32_t num_candidates = VA_Get_Size(candidates);

    // An optimizing pass takes every candidate; the empty result array is
    // not needed.
    if (optimize) {
        DECREF(recyclables);
        return candidates;
    }

    // Order candidates by doc count, then hand the per-segment counts to
    // Choose_Sparse so it can pick the sparsely populated ones.
    VA_Sort(candidates, S_compare_doc_count, NULL);
    int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t));
    uint32_t tick = 0;
    while (tick < num_candidates) {
        SegReader *seg_reader
            = (SegReader*)CERTIFY(VA_Fetch(candidates, tick), SEGREADER);
        counts[tick] = SegReader_Doc_Count(seg_reader);
        tick++;
    }
    // The I32Array takes over the `counts` buffer.
    I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates);
    const uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts);
    DECREF(doc_counts);

    // Everything below the threshold moves straight into the result.
    tick = 0;
    while (tick < threshold) {
        Obj *moved = (Obj*)VA_Delete(candidates, tick);
        VA_Store(recyclables, tick, moved);
        tick++;
    }

    // The rest qualify only if at least 10% of their docs are deleted.
    for (tick = threshold; tick < num_candidates; tick++) {
        SegReader *seg_reader = (SegReader*)VA_Delete(candidates, tick);
        CharBuf   *seg_name   = SegReader_Get_Seg_Name(seg_reader);
        const double doc_max       = SegReader_Doc_Max(seg_reader);
        const double num_deletions = DelWriter_Seg_Del_Count(del_writer,
                                                             seg_name);
        const double del_proportion = num_deletions / doc_max;

        // Negated ">=" rather than "<" so a NaN proportion is rejected.
        if (!(del_proportion >= 0.1)) {
            DECREF(seg_reader);
            continue;
        }
        VA_Push(recyclables, (Obj*)seg_reader);
    }

    DECREF(candidates);
    return recyclables;
}
/* Record one hit for `doc_id`.
 *
 * Always increments total_hits.  If SI_competitive() accepts the hit, the
 * spare MatchDoc held in ivars->bumped is filled in (doc id offset by
 * ivars->base, score if needed, sort values if needed) and offered to the
 * hit queue via HitQ_Jostle().  Whatever Jostle hands back becomes the next
 * spare; if it hands back nothing, a fresh spare MatchDoc is created.
 */
void
SortColl_collect(SortCollector *self, int32_t doc_id) {
    SortCollectorIVARS *const ivars = SortColl_IVARS(self);

    // Add to the total number of hits.
    ivars->total_hits++;

    // Only competitive hits are collected.
    if (!SI_competitive(ivars, doc_id)) {
        return;
    }

    MatchDoc *const match_doc = ivars->bumped;
    MatchDocIVARS *const match_doc_ivars = MatchDoc_IVARS(match_doc);
    match_doc_ivars->doc_id = doc_id + ivars->base;

    // F32_NEGINF is the "score not yet computed" marker set when a MatchDoc
    // is created or recycled below.
    if (ivars->need_score && match_doc_ivars->score == F32_NEGINF) {
        match_doc_ivars->score = Matcher_Score(ivars->matcher);
    }

    // Fetch values so that cross-segment sorting can work.
    if (ivars->need_values) {
        VArray *values = match_doc_ivars->values;

        for (uint32_t i = 0, max = ivars->num_rules; i < max; i++) {
            SortCache *cache = ivars->sort_caches[i];
            // VA_Delete hands ownership of the old value to us.
            Obj *old_val = (Obj*)VA_Delete(values, i);
            if (cache) {
                int32_t ord = SortCache_Ordinal(cache, doc_id);
                // Reuse the previous value object as scratch space when
                // there is one.
                Obj *blank = old_val
                             ? old_val
                             : SortCache_Make_Blank(cache);
                Obj *val = SortCache_Value(cache, ord, blank);
                if (val) { VA_Store(values, i, (Obj*)val); }
                else     { DECREF(blank); }
            }
            else {
                // No cache for this rule: a recycled MatchDoc may still
                // carry a value from an earlier use.  Release it, otherwise
                // the reference obtained from VA_Delete is leaked.
                DECREF(old_val);
            }
        }
    }

    // Insert the new MatchDoc.
    ivars->bumped = (MatchDoc*)HitQ_Jostle(ivars->hit_q, (Obj*)match_doc);

    if (ivars->bumped) {
        if (ivars->bumped == match_doc) {
            /* The queue is full, and we have established a threshold for
             * this segment as to what sort of document is definitely not
             * acceptable.  Turn off AUTO_ACCEPT and start actually
             * testing whether hits are competitive. */
            ivars->bubble_score = match_doc_ivars->score;
            ivars->bubble_doc   = doc_id;
            ivars->actions      = ivars->derived_actions;
        }

        // Recycle: reset the score of the MatchDoc that will be reused.
        MatchDoc_IVARS(ivars->bumped)->score = ivars->need_score
                                               ? F32_NEGINF
                                               : F32_NAN;
    }
    else {
        // The queue isn't full yet, so create a fresh MatchDoc.
        VArray *values = ivars->need_values
                         ? VA_new(ivars->num_rules)
                         : NULL;
        float fake_score = ivars->need_score
                           ? F32_NEGINF
                           : F32_NAN;
        ivars->bumped = MatchDoc_new(INT32_MAX, fake_score, values);
        DECREF(values);
    }
}
/* Finish writing sort data for the segment.
 *
 * Does nothing if no temp ord stream was ever opened.  Otherwise: optionally
 * flushes every field writer, closes the three temp streams, finishes each
 * field writer while recording its count, null ordinal (when not -1) and
 * ord width, stores the "sort" metadata on the segment, and deletes the
 * three temp files from the folder.
 */
void
SortWriter_finish(SortWriter *self) {
    SortWriterIVARS *const ivars = SortWriter_IVARS(self);
    VArray *const writers = ivars->field_writers;

    // If we have no data, bail out.
    if (!ivars->temp_ord_out) {
        return;
    }

    // If we've either flushed or added segments, flush everything so that any
    // one field can use the entire margin up to mem_thresh.
    if (ivars->flush_at_finish) {
        const uint32_t limit = VA_Get_Size(writers);
        // Slot 0 is skipped, as in the finishing loop below.
        uint32_t field_num = 1;
        while (field_num < limit) {
            SortFieldWriter *writer
                = (SortFieldWriter*)VA_Fetch(writers, field_num);
            if (writer) {
                SortFieldWriter_Flush(writer);
            }
            field_num++;
        }
    }

    // Close down temp streams.
    OutStream_Close(ivars->temp_ord_out);
    OutStream_Close(ivars->temp_ix_out);
    OutStream_Close(ivars->temp_dat_out);

    // Finish each field writer, recording its stats under the field name.
    const uint32_t num_slots = VA_Get_Size(writers);
    uint32_t field_num = 1;
    while (field_num < num_slots) {
        // VA_Delete hands over the writer; it is released after use.
        SortFieldWriter *writer
            = (SortFieldWriter*)VA_Delete(writers, field_num);

        if (writer) {
            CharBuf *field = Seg_Field_Name(ivars->segment, field_num);
            SortFieldWriter_Flip(writer);

            int32_t count = SortFieldWriter_Finish(writer);
            CharBuf *count_str = CB_newf("%i32", count);
            Hash_Store(ivars->counts, (Obj*)field, (Obj*)count_str);

            // A null ordinal of -1 is not recorded.
            int32_t null_ord = SortFieldWriter_Get_Null_Ord(writer);
            if (null_ord != -1) {
                CharBuf *null_ord_str = CB_newf("%i32", null_ord);
                Hash_Store(ivars->null_ords, (Obj*)field,
                           (Obj*)null_ord_str);
            }

            int32_t ord_width = SortFieldWriter_Get_Ord_Width(writer);
            CharBuf *ord_width_str = CB_newf("%i32", ord_width);
            Hash_Store(ivars->ord_widths, (Obj*)field, (Obj*)ord_width_str);
        }

        DECREF(writer);
        field_num++;
    }
    VA_Clear(writers);

    // Store metadata.
    Obj *metadata = (Obj*)SortWriter_Metadata(self);
    Seg_Store_Metadata_Str(ivars->segment, "sort", 4, metadata);

    // Clean up the temp files, reusing one path buffer.
    Folder  *folder   = ivars->folder;
    CharBuf *seg_name = Seg_Get_Name(ivars->segment);
    CharBuf *path     = CB_newf("%o/sort_ord_temp", seg_name);
    Folder_Delete(folder, path);
    CB_setf(path, "%o/sort_ix_temp", seg_name);
    Folder_Delete(folder, path);
    CB_setf(path, "%o/sort_dat_temp", seg_name);
    Folder_Delete(folder, path);
    DECREF(path);
}