void Indexer_Prepare_Commit_IMP(Indexer *self) { IndexerIVARS *const ivars = Indexer_IVARS(self); Vector *seg_readers = PolyReader_Get_Seg_Readers(ivars->polyreader); size_t num_seg_readers = Vec_Get_Size(seg_readers); bool merge_happened = false; if (!ivars->write_lock || ivars->prepared) { THROW(ERR, "Can't call Prepare_Commit() more than once"); } // Merge existing index data. if (num_seg_readers) { merge_happened = S_maybe_merge(self, seg_readers); } // Add a new segment and write a new snapshot file if... if (Seg_Get_Count(ivars->segment) // Docs/segs added. || merge_happened // Some segs merged. || !Snapshot_Num_Entries(ivars->snapshot) // Initializing index. || DelWriter_Updated(ivars->del_writer) ) { Folder *folder = ivars->folder; Schema *schema = ivars->schema; Snapshot *snapshot = ivars->snapshot; // Derive snapshot and schema file names. DECREF(ivars->snapfile); String *snapfile = IxManager_Make_Snapshot_Filename(ivars->manager); ivars->snapfile = Str_Cat_Trusted_Utf8(snapfile, ".temp", 5); DECREF(snapfile); uint64_t schema_gen = IxFileNames_extract_gen(ivars->snapfile); char base36[StrHelp_MAX_BASE36_BYTES]; StrHelp_to_base36(schema_gen, &base36); String *new_schema_name = Str_newf("schema_%s.json", base36); // Finish the segment, write schema file. SegWriter_Finish(ivars->seg_writer); Schema_Write(schema, folder, new_schema_name); String *old_schema_name = S_find_schema_file(snapshot); if (old_schema_name) { Snapshot_Delete_Entry(snapshot, old_schema_name); } Snapshot_Add_Entry(snapshot, new_schema_name); DECREF(new_schema_name); // Write temporary snapshot file. Folder_Delete(folder, ivars->snapfile); Snapshot_Write_File(snapshot, folder, ivars->snapfile); ivars->needs_commit = true; } // Close reader, so that we can delete its files if appropriate. PolyReader_Close(ivars->polyreader); ivars->prepared = true; }
Vector* IxManager_Recycle_IMP(IndexManager *self, PolyReader *reader, DeletionsWriter *del_writer, int64_t cutoff, bool optimize) { Vector *seg_readers = PolyReader_Get_Seg_Readers(reader); size_t num_seg_readers = Vec_Get_Size(seg_readers); SegReader **candidates = (SegReader**)MALLOCATE(num_seg_readers * sizeof(SegReader*)); size_t num_candidates = 0; for (size_t i = 0; i < num_seg_readers; i++) { SegReader *seg_reader = (SegReader*)Vec_Fetch(seg_readers, i); if (SegReader_Get_Seg_Num(seg_reader) > cutoff) { candidates[num_candidates++] = seg_reader; } } Vector *recyclables = Vec_new(num_candidates); if (optimize) { for (size_t i = 0; i < num_candidates; i++) { Vec_Push(recyclables, INCREF(candidates[i])); } FREEMEM(candidates); return recyclables; } // Sort by ascending size in docs, choose sparsely populated segments. qsort(candidates, num_candidates, sizeof(SegReader*), S_compare_doc_count); int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t)); for (uint32_t i = 0; i < num_candidates; i++) { counts[i] = SegReader_Doc_Count(candidates[i]); } I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates); uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts); DECREF(doc_counts); // Move SegReaders to be recycled. for (uint32_t i = 0; i < threshold; i++) { Vec_Store(recyclables, i, INCREF(candidates[i])); } // Find segments where at least 10% of all docs have been deleted. for (uint32_t i = threshold; i < num_candidates; i++) { SegReader *seg_reader = candidates[i]; String *seg_name = SegReader_Get_Seg_Name(seg_reader); double doc_max = SegReader_Doc_Max(seg_reader); double num_deletions = DelWriter_Seg_Del_Count(del_writer, seg_name); double del_proportion = num_deletions / doc_max; if (del_proportion >= 0.1) { Vec_Push(recyclables, INCREF(seg_reader)); } } FREEMEM(candidates); return recyclables; }
VArray* IxManager_recycle(IndexManager *self, PolyReader *reader, DeletionsWriter *del_writer, int64_t cutoff, bool_t optimize) { VArray *seg_readers = PolyReader_Get_Seg_Readers(reader); VArray *candidates = VA_Gather(seg_readers, S_check_cutoff, &cutoff); VArray *recyclables = VA_new(VA_Get_Size(candidates)); const uint32_t num_candidates = VA_Get_Size(candidates); if (optimize) { DECREF(recyclables); return candidates; } // Sort by ascending size in docs, choose sparsely populated segments. VA_Sort(candidates, S_compare_doc_count, NULL); int32_t *counts = (int32_t*)MALLOCATE(num_candidates * sizeof(int32_t)); for (uint32_t i = 0; i < num_candidates; i++) { SegReader *seg_reader = (SegReader*)CERTIFY( VA_Fetch(candidates, i), SEGREADER); counts[i] = SegReader_Doc_Count(seg_reader); } I32Array *doc_counts = I32Arr_new_steal(counts, num_candidates); uint32_t threshold = IxManager_Choose_Sparse(self, doc_counts); DECREF(doc_counts); // Move SegReaders to be recycled. for (uint32_t i = 0; i < threshold; i++) { VA_Store(recyclables, i, VA_Delete(candidates, i)); } // Find segments where at least 10% of all docs have been deleted. for (uint32_t i = threshold; i < num_candidates; i++) { SegReader *seg_reader = (SegReader*)VA_Delete(candidates, i); CharBuf *seg_name = SegReader_Get_Seg_Name(seg_reader); double doc_max = SegReader_Doc_Max(seg_reader); double num_deletions = DelWriter_Seg_Del_Count(del_writer, seg_name); double del_proportion = num_deletions / doc_max; if (del_proportion >= 0.1) { VA_Push(recyclables, (Obj*)seg_reader); } else { DECREF(seg_reader); } } DECREF(candidates); return recyclables; }
void BGMerger_Prepare_Commit_IMP(BackgroundMerger *self) { BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self); Vector *seg_readers = PolyReader_Get_Seg_Readers(ivars->polyreader); uint32_t num_seg_readers = Vec_Get_Size(seg_readers); uint32_t segs_merged = 0; if (ivars->prepared) { THROW(ERR, "Can't call Prepare_Commit() more than once"); } // Maybe merge existing index data. if (num_seg_readers) { segs_merged = S_maybe_merge(self); } if (!segs_merged) { // Nothing merged. Leave `needs_commit` false and bail out. ivars->prepared = true; return; } // Finish the segment and write a new snapshot file. else { Folder *folder = ivars->folder; Snapshot *snapshot = ivars->snapshot; // Write out new deletions. if (DelWriter_Updated(ivars->del_writer)) { // Only write out if they haven't all been applied. if (segs_merged != num_seg_readers) { DelWriter_Finish(ivars->del_writer); } } // Finish the segment. SegWriter_Finish(ivars->seg_writer); // Grab the write lock. S_obtain_write_lock(self); if (!ivars->write_lock) { RETHROW(INCREF(Err_get_error())); } // Write temporary snapshot file. DECREF(ivars->snapfile); String *snapfile = IxManager_Make_Snapshot_Filename(ivars->manager); ivars->snapfile = Str_Cat_Trusted_Utf8(snapfile, ".temp", 5); DECREF(snapfile); Folder_Delete(folder, ivars->snapfile); Snapshot_Write_File(snapshot, folder, ivars->snapfile); // Determine whether the index has been updated while this background // merge process was running. String *start_snapfile = Snapshot_Get_Path(PolyReader_Get_Snapshot(ivars->polyreader)); Snapshot *latest_snapshot = Snapshot_Read_File(Snapshot_new(), ivars->folder, NULL); String *latest_snapfile = Snapshot_Get_Path(latest_snapshot); bool index_updated = !Str_Equals(start_snapfile, (Obj*)latest_snapfile); if (index_updated) { /* See if new deletions have been applied since this * background merge process started against any of the * segments we just merged away. If that's true, we need to * write another segment which applies the deletions against * the new composite segment. */ S_merge_updated_deletions(self); // Add the fresh content to our snapshot. (It's important to // run this AFTER S_merge_updated_deletions, because otherwise // we couldn't tell whether the deletion counts changed.) Vector *files = Snapshot_List(latest_snapshot); for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) { String *file = (String*)Vec_Fetch(files, i); if (Str_Starts_With_Utf8(file, "seg_", 4)) { int64_t gen = (int64_t)IxFileNames_extract_gen(file); if (gen > ivars->cutoff) { Snapshot_Add_Entry(ivars->snapshot, file); } } } DECREF(files); // Since the snapshot content has changed, we need to rewrite it. Folder_Delete(folder, ivars->snapfile); Snapshot_Write_File(snapshot, folder, ivars->snapfile); } DECREF(latest_snapshot); ivars->needs_commit = true; } // Close reader, so that we can delete its files if appropriate. PolyReader_Close(ivars->polyreader); ivars->prepared = true; }
static bool S_merge_updated_deletions(BackgroundMerger *self) { BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self); Hash *updated_deletions = NULL; PolyReader *new_polyreader = PolyReader_open((Obj*)ivars->folder, NULL, NULL); Vector *new_seg_readers = PolyReader_Get_Seg_Readers(new_polyreader); Vector *old_seg_readers = PolyReader_Get_Seg_Readers(ivars->polyreader); Hash *new_segs = Hash_new(Vec_Get_Size(new_seg_readers)); for (uint32_t i = 0, max = Vec_Get_Size(new_seg_readers); i < max; i++) { SegReader *seg_reader = (SegReader*)Vec_Fetch(new_seg_readers, i); String *seg_name = SegReader_Get_Seg_Name(seg_reader); Hash_Store(new_segs, seg_name, INCREF(seg_reader)); } for (uint32_t i = 0, max = Vec_Get_Size(old_seg_readers); i < max; i++) { SegReader *seg_reader = (SegReader*)Vec_Fetch(old_seg_readers, i); String *seg_name = SegReader_Get_Seg_Name(seg_reader); // If this segment was merged away... if (Hash_Fetch(ivars->doc_maps, seg_name)) { SegReader *new_seg_reader = (SegReader*)CERTIFY( Hash_Fetch(new_segs, seg_name), SEGREADER); int32_t old_del_count = SegReader_Del_Count(seg_reader); int32_t new_del_count = SegReader_Del_Count(new_seg_reader); // ... were any new deletions applied against it? if (old_del_count != new_del_count) { DeletionsReader *del_reader = (DeletionsReader*)SegReader_Obtain( new_seg_reader, Class_Get_Name(DELETIONSREADER)); if (!updated_deletions) { updated_deletions = Hash_new(max); } Hash_Store(updated_deletions, seg_name, (Obj*)DelReader_Iterator(del_reader)); } } } DECREF(new_polyreader); DECREF(new_segs); if (!updated_deletions) { return false; } else { PolyReader *merge_polyreader = PolyReader_open((Obj*)ivars->folder, ivars->snapshot, NULL); Vector *merge_seg_readers = PolyReader_Get_Seg_Readers(merge_polyreader); Snapshot *latest_snapshot = Snapshot_Read_File(Snapshot_new(), ivars->folder, NULL); int64_t new_seg_num = IxManager_Highest_Seg_Num(ivars->manager, latest_snapshot) + 1; Segment *new_segment = Seg_new(new_seg_num); SegWriter *seg_writer = SegWriter_new(ivars->schema, ivars->snapshot, new_segment, merge_polyreader); DeletionsWriter *del_writer = SegWriter_Get_Del_Writer(seg_writer); int64_t merge_seg_num = Seg_Get_Number(ivars->segment); uint32_t seg_tick = INT32_MAX; int32_t offset = INT32_MAX; SegWriter_Prep_Seg_Dir(seg_writer); for (uint32_t i = 0, max = Vec_Get_Size(merge_seg_readers); i < max; i++) { SegReader *seg_reader = (SegReader*)Vec_Fetch(merge_seg_readers, i); if (SegReader_Get_Seg_Num(seg_reader) == merge_seg_num) { I32Array *offsets = PolyReader_Offsets(merge_polyreader); seg_tick = i; offset = I32Arr_Get(offsets, seg_tick); DECREF(offsets); } } if (offset == INT32_MAX) { THROW(ERR, "Failed sanity check"); } HashIterator *iter = HashIter_new(updated_deletions); while (HashIter_Next(iter)) { String *seg_name = HashIter_Get_Key(iter); Matcher *deletions = (Matcher*)HashIter_Get_Value(iter); I32Array *doc_map = (I32Array*)CERTIFY( Hash_Fetch(ivars->doc_maps, seg_name), I32ARRAY); int32_t del; while (0 != (del = Matcher_Next(deletions))) { // Find the slot where the deleted doc resides in the // rewritten segment. If the doc was already deleted when we // were merging, do nothing. int32_t remapped = I32Arr_Get(doc_map, del); if (remapped) { // It's a new deletion, so carry it forward and zap it in // the rewritten segment. DelWriter_Delete_By_Doc_ID(del_writer, remapped + offset); } } } DECREF(iter); // Finish the segment and clean up. DelWriter_Finish(del_writer); SegWriter_Finish(seg_writer); DECREF(seg_writer); DECREF(new_segment); DECREF(latest_snapshot); DECREF(merge_polyreader); DECREF(updated_deletions); } return true; }