DefaultHighlightReader*
DefHLReader_init(DefaultHighlightReader *self, Schema *schema, Folder *folder,
                 Snapshot *snapshot, Vector *segments, int32_t seg_tick) {
    HLReader_init((HighlightReader*)self, schema, folder, snapshot,
                  segments, seg_tick);
    DefaultHighlightReaderIVARS *const ivars = DefHLReader_IVARS(self);
    Segment *segment = DefHLReader_Get_Segment(self);
    Hash *metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "highlight", 9);
    if (!metadata) {
        metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "term_vectors", 12);
    }

    // Check format.
    if (metadata) {
        Obj *format = Hash_Fetch_Utf8(metadata, "format", 6);
        if (!format) {
            THROW(ERR, "Missing 'format' var");
        }
        else {
            if (Json_obj_to_i64(format) != HLWriter_current_file_format) {
                THROW(ERR, "Unsupported highlight data format: %i64",
                      Json_obj_to_i64(format));
            }
        }
    }

    // Open instreams.
    String *seg_name = Seg_Get_Name(segment);
    String *ix_file  = Str_newf("%o/highlight.ix", seg_name);
    String *dat_file = Str_newf("%o/highlight.dat", seg_name);
    if (Folder_Exists(folder, ix_file)) {
        ivars->ix_in = Folder_Open_In(folder, ix_file);
        if (!ivars->ix_in) {
            Err *error = (Err*)INCREF(Err_get_error());
            DECREF(ix_file);
            DECREF(dat_file);
            DECREF(self);
            RETHROW(error);
        }
        ivars->dat_in = Folder_Open_In(folder, dat_file);
        if (!ivars->dat_in) {
            Err *error = (Err*)INCREF(Err_get_error());
            DECREF(ix_file);
            DECREF(dat_file);
            DECREF(self);
            RETHROW(error);
        }
    }
    DECREF(ix_file);
    DECREF(dat_file);

    return self;
}
Snapshot*
Snapshot_Read_File_IMP(Snapshot *self, Folder *folder, String *path) {
    SnapshotIVARS *const ivars = Snapshot_IVARS(self);

    // Eliminate all prior data.  Pick a snapshot file.
    S_zero_out(self);
    ivars->path = (path != NULL && Str_Get_Size(path) > 0)
                  ? Str_Clone(path)
                  : IxFileNames_latest_snapshot(folder);

    if (ivars->path) {
        Hash *snap_data
            = (Hash*)CERTIFY(Json_slurp_json(folder, ivars->path), HASH);
        Obj *format_obj
            = CERTIFY(Hash_Fetch_Utf8(snap_data, "format", 6), OBJ);
        int32_t format = (int32_t)Json_obj_to_i64(format_obj);
        Obj *subformat_obj = Hash_Fetch_Utf8(snap_data, "subformat", 9);
        int32_t subformat = subformat_obj
                            ? (int32_t)Json_obj_to_i64(subformat_obj)
                            : 0;

        // Verify that we can read the index properly.
        if (format > Snapshot_current_file_format) {
            THROW(ERR, "Snapshot format too recent: %i32, %i32", format,
                  Snapshot_current_file_format);
        }

        // Build up list of entries.
        Vector *list = (Vector*)INCREF(CERTIFY(
                           Hash_Fetch_Utf8(snap_data, "entries", 7),
                           VECTOR));
        if (format == 1 || (format == 2 && subformat < 1)) {
            Vector *cleaned = S_clean_segment_contents(list);
            DECREF(list);
            list = cleaned;
        }
        Hash_Clear(ivars->entries);
        for (uint32_t i = 0, max = Vec_Get_Size(list); i < max; i++) {
            String *entry = (String*)CERTIFY(Vec_Fetch(list, i), STRING);
            Hash_Store(ivars->entries, entry, (Obj*)CFISH_TRUE);
        }

        DECREF(list);
        DECREF(snap_data);
    }

    return self;
}
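/* Illustrative sketch, not part of the source tree: based on the keys read
 * above ("format", "subformat", "entries"), a snapshot JSON file is expected
 * to look roughly like the following.  The concrete values and entry names
 * are hypothetical.
 *
 *     {
 *         "format": 2,
 *         "subformat": 1,
 *         "entries": ["schema_5.json", "seg_5"]
 *     }
 */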
DefaultDocReader*
DefDocReader_init(DefaultDocReader *self, Schema *schema, Folder *folder,
                  Snapshot *snapshot, Vector *segments, int32_t seg_tick) {
    Hash *metadata;
    Segment *segment;
    DocReader_init((DocReader*)self, schema, folder, snapshot, segments,
                   seg_tick);
    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
    segment  = DefDocReader_Get_Segment(self);
    metadata = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "documents", 9);

    if (metadata) {
        String *seg_name = Seg_Get_Name(segment);
        String *ix_file  = Str_newf("%o/documents.ix", seg_name);
        String *dat_file = Str_newf("%o/documents.dat", seg_name);
        Obj *format = Hash_Fetch_Utf8(metadata, "format", 6);

        // Check format.
        if (!format) {
            THROW(ERR, "Missing 'format' var");
        }
        else {
            int64_t format_val = Json_obj_to_i64(format);
            if (format_val < DocWriter_current_file_format) {
                THROW(ERR, "Obsolete doc storage format %i64; "
                      "Index regeneration is required", format_val);
            }
            else if (format_val != DocWriter_current_file_format) {
                THROW(ERR, "Unsupported doc storage format: %i64",
                      format_val);
            }
        }

        // Get streams.
        if (Folder_Exists(folder, ix_file)) {
            ivars->ix_in = Folder_Open_In(folder, ix_file);
            if (!ivars->ix_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
            ivars->dat_in = Folder_Open_In(folder, dat_file);
            if (!ivars->dat_in) {
                Err *error = (Err*)INCREF(Err_get_error());
                DECREF(ix_file);
                DECREF(dat_file);
                DECREF(self);
                RETHROW(error);
            }
        }
        DECREF(ix_file);
        DECREF(dat_file);
    }

    return self;
}
Obj*
ProximityQuery_Load_IMP(ProximityQuery *self, Obj *dump) {
    Hash *source = (Hash*)CERTIFY(dump, HASH);
    ProximityQuery_Load_t super_load
        = SUPER_METHOD_PTR(PROXIMITYQUERY, LUCY_ProximityQuery_Load);
    ProximityQuery *loaded = (ProximityQuery*)super_load(self, dump);
    ProximityQueryIVARS *loaded_ivars = ProximityQuery_IVARS(loaded);
    Obj *field = CERTIFY(Hash_Fetch_Utf8(source, "field", 5), OBJ);
    loaded_ivars->field = (String*)CERTIFY(Freezer_load(field), STRING);
    Obj *terms = CERTIFY(Hash_Fetch_Utf8(source, "terms", 5), OBJ);
    loaded_ivars->terms = (Vector*)CERTIFY(Freezer_load(terms), VECTOR);
    Obj *within = CERTIFY(Hash_Fetch_Utf8(source, "within", 6), OBJ);
    loaded_ivars->within = (uint32_t)Json_obj_to_i64(within);
    return (Obj*)loaded;
}
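/* Illustrative sketch, not from the source tree: the dump consumed above is
 * a Hash carrying at least the "field", "terms", and "within" keys, plus
 * whatever the parent class's Load() expects.  The values below are
 * hypothetical, and in a real dump "field" and "terms" hold Freezer
 * serializations rather than the bare values sketched here.
 *
 *     {
 *         "field":  "content",
 *         "terms":  ["small", "stocks"],
 *         "within": 10
 *     }
 */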
void
DefDelWriter_Merge_Segment_IMP(DefaultDeletionsWriter *self,
                               SegReader *reader, I32Array *doc_map) {
    DefaultDeletionsWriterIVARS *const ivars = DefDelWriter_IVARS(self);
    UNUSED_VAR(doc_map);
    Segment *segment = SegReader_Get_Segment(reader);
    Hash *del_meta = (Hash*)Seg_Fetch_Metadata_Utf8(segment, "deletions", 9);

    if (del_meta) {
        Vector *seg_readers = ivars->seg_readers;
        Hash *files = (Hash*)Hash_Fetch_Utf8(del_meta, "files", 5);
        if (files) {
            HashIterator *iter = HashIter_new(files);
            while (HashIter_Next(iter)) {
                String *seg       = HashIter_Get_Key(iter);
                Hash   *mini_meta = (Hash*)HashIter_Get_Value(iter);

                /* Find the segment the deletions from the SegReader
                 * we're adding correspond to.  If it's gone, we don't
                 * need to worry about losing deletions files that point
                 * at it. */
                for (size_t i = 0, max = Vec_Get_Size(seg_readers);
                     i < max;
                     i++
                    ) {
                    SegReader *candidate
                        = (SegReader*)Vec_Fetch(seg_readers, i);
                    String *candidate_name
                        = Seg_Get_Name(SegReader_Get_Segment(candidate));

                    if (Str_Equals(seg, (Obj*)candidate_name)) {
                        /* If the count hasn't changed, we're about to
                         * merge away the most recent deletions file
                         * pointing at this target segment -- so force a
                         * new file to be written out. */
                        int32_t count = (int32_t)Json_obj_to_i64(
                                            Hash_Fetch_Utf8(mini_meta,
                                                            "count", 5));
                        DeletionsReader *del_reader
                            = (DeletionsReader*)SegReader_Obtain(
                                  candidate,
                                  Class_Get_Name(DELETIONSREADER));
                        if (count == DelReader_Del_Count(del_reader)) {
                            ivars->updated[i] = true;
                        }
                        break;
                    }
                }
            }
            DECREF(iter);
        }
    }
}
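/* Illustrative sketch, not from the source tree: the "deletions" metadata
 * walked above maps target segment names to per-file stats, of which this
 * routine only consults "count".  The segment name and count below are
 * hypothetical, and real entries may carry additional keys.
 *
 *     {
 *         "files": {
 *             "seg_2": { "count": 4, ... }
 *         }
 *     }
 */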
bool
Seg_Read_File_IMP(Segment *self, Folder *folder) {
    SegmentIVARS *const ivars = Seg_IVARS(self);
    String *filename = Str_newf("%o/segmeta.json", ivars->name);
    Hash *metadata = (Hash*)Json_slurp_json(folder, filename);
    Hash *my_metadata;

    // Bail unless the segmeta file was read successfully.
    DECREF(filename);
    if (!metadata) { return false; }
    CERTIFY(metadata, HASH);

    // Grab metadata for the Segment object itself.
    DECREF(ivars->metadata);
    ivars->metadata = metadata;
    my_metadata = (Hash*)CERTIFY(
                      Hash_Fetch_Utf8(ivars->metadata, "segmeta", 7), HASH);

    // Assign.
    Obj *count = Hash_Fetch_Utf8(my_metadata, "count", 5);
    if (!count) { count = Hash_Fetch_Utf8(my_metadata, "doc_count", 9); }
    if (!count) { THROW(ERR, "Missing 'count'"); }
    else        { ivars->count = Json_obj_to_i64(count); }

    // Get list of field nums.
    Vector *source_by_num = (Vector*)Hash_Fetch_Utf8(my_metadata,
                                                     "field_names", 11);
    size_t num_fields = source_by_num ? Vec_Get_Size(source_by_num) : 0;
    if (source_by_num == NULL) {
        THROW(ERR, "Failed to extract 'field_names' from metadata");
    }

    // Init.
    DECREF(ivars->by_num);
    DECREF(ivars->by_name);
    ivars->by_num  = Vec_new(num_fields);
    ivars->by_name = Hash_new(num_fields);

    // Copy the list of fields from the source.
    for (size_t i = 0; i < num_fields; i++) {
        String *name = (String*)Vec_Fetch(source_by_num, i);
        Seg_Add_Field(self, name);
    }

    return true;
}
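/* Illustrative sketch, not from the source tree: segmeta.json holds the
 * Segment's own data under the "segmeta" key read above, alongside the
 * per-component sections fetched elsewhere via Seg_Fetch_Metadata_Utf8()
 * (e.g. "documents", "highlight", "deletions").  The values below are
 * hypothetical.
 *
 *     {
 *         "segmeta": {
 *             "count": 42,
 *             "field_names": ["title", "content"]
 *         },
 *         "documents": { "format": 2 },
 *         ...
 *     }
 */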
static void
S_zap_dead_merge(FilePurger *self, Hash *candidates) {
    FilePurgerIVARS *const ivars = FilePurger_IVARS(self);
    IndexManager *manager = ivars->manager;
    Lock *merge_lock = IxManager_Make_Merge_Lock(manager);

    Lock_Clear_Stale(merge_lock);
    if (!Lock_Is_Locked(merge_lock)) {
        Hash *merge_data = IxManager_Read_Merge_Data(manager);
        Obj *cutoff = merge_data
                      ? Hash_Fetch_Utf8(merge_data, "cutoff", 6)
                      : NULL;

        if (cutoff) {
            String *cutoff_seg = Seg_num_to_name(Json_obj_to_i64(cutoff));
            if (Folder_Exists(ivars->folder, cutoff_seg)) {
                String *merge_json = SSTR_WRAP_UTF8("merge.json", 10);
                DirHandle *dh = Folder_Open_Dir(ivars->folder, cutoff_seg);

                if (!dh) {
                    THROW(ERR, "Can't open segment dir '%o'", cutoff_seg);
                }

                Hash_Store(candidates, cutoff_seg, (Obj*)CFISH_TRUE);
                Hash_Store(candidates, merge_json, (Obj*)CFISH_TRUE);

                while (DH_Next(dh)) {
                    // TODO: recursively delete subdirs within seg dir.
                    String *entry = DH_Get_Entry(dh);
                    String *filepath = Str_newf("%o/%o", cutoff_seg, entry);
                    Hash_Store(candidates, filepath, (Obj*)CFISH_TRUE);
                    DECREF(filepath);
                    DECREF(entry);
                }

                DECREF(dh);
            }
            DECREF(cutoff_seg);
        }

        DECREF(merge_data);
    }

    DECREF(merge_lock);
    return;
}
static void
test_offsets(TestBatchRunner *runner) {
    Folder *folder = S_folder_with_contents();
    CompoundFileWriter *cf_writer = CFWriter_new(folder);
    Hash *cf_metadata;
    Hash *files;

    CFWriter_Consolidate(cf_writer);

    cf_metadata = (Hash*)CERTIFY(
                      Json_slurp_json(folder, cfmeta_file), HASH);
    files = (Hash*)CERTIFY(
                Hash_Fetch_Utf8(cf_metadata, "files", 5), HASH);

    bool offsets_ok = true;

    TEST_TRUE(runner, Hash_Get_Size(files) > 0, "Multiple files");

    HashIterator *iter = HashIter_new(files);
    while (HashIter_Next(iter)) {
        String *file   = HashIter_Get_Key(iter);
        Hash   *stats  = (Hash*)CERTIFY(HashIter_Get_Value(iter), HASH);
        Obj    *offset = CERTIFY(Hash_Fetch_Utf8(stats, "offset", 6), OBJ);
        int64_t offs   = Json_obj_to_i64(offset);
        if (offs % 8 != 0) {
            offsets_ok = false;
            FAIL(runner, "Offset %" PRId64 " for %s not a multiple of 8",
                 offs, Str_Get_Ptr8(file));
            break;
        }
    }
    DECREF(iter);

    if (offsets_ok) {
        PASS(runner, "All offsets are multiples of 8");
    }

    DECREF(cf_metadata);
    DECREF(cf_writer);
    DECREF(folder);
}
Schema*
Schema_Load_IMP(Schema *self, Obj *dump) {
    Hash *source = (Hash*)CERTIFY(dump, HASH);
    String *class_name
        = (String*)CERTIFY(Hash_Fetch_Utf8(source, "_class", 6), STRING);
    Class *klass = Class_singleton(class_name, NULL);
    Schema *loaded = (Schema*)Class_Make_Obj(klass);
    Hash *type_dumps
        = (Hash*)CERTIFY(Hash_Fetch_Utf8(source, "fields", 6), HASH);
    Vector *analyzer_dumps
        = (Vector*)CERTIFY(Hash_Fetch_Utf8(source, "analyzers", 9), VECTOR);
    Vector *analyzers = (Vector*)Freezer_load((Obj*)analyzer_dumps);
    UNUSED_VAR(self);

    // Start with a blank Schema.
    Schema_init(loaded);
    SchemaIVARS *const loaded_ivars = Schema_IVARS(loaded);
    Vec_Grow(loaded_ivars->uniq_analyzers, Vec_Get_Size(analyzers));

    HashIterator *iter = HashIter_new(type_dumps);
    while (HashIter_Next(iter)) {
        String *field = HashIter_Get_Key(iter);
        Hash *type_dump = (Hash*)CERTIFY(HashIter_Get_Value(iter), HASH);
        String *type_str = (String*)Hash_Fetch_Utf8(type_dump, "type", 4);

        if (type_str) {
            if (Str_Equals_Utf8(type_str, "fulltext", 8)) {
                // Replace the "analyzer" tick with the real thing.
                Obj *tick
                    = CERTIFY(Hash_Fetch_Utf8(type_dump, "analyzer", 8), OBJ);
                Analyzer *analyzer
                    = (Analyzer*)Vec_Fetch(analyzers,
                                           (uint32_t)Json_obj_to_i64(tick));
                if (!analyzer) {
                    THROW(ERR, "Can't find analyzer for '%o'", field);
                }
                Hash_Store_Utf8(type_dump, "analyzer", 8, INCREF(analyzer));
                FullTextType *type
                    = (FullTextType*)S_load_type(FULLTEXTTYPE,
                                                 (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "string", 6)) {
                StringType *type
                    = (StringType*)S_load_type(STRINGTYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "blob", 4)) {
                BlobType *type
                    = (BlobType*)S_load_type(BLOBTYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "i32_t", 5)) {
                Int32Type *type
                    = (Int32Type*)S_load_type(INT32TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "i64_t", 5)) {
                Int64Type *type
                    = (Int64Type*)S_load_type(INT64TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "f32_t", 5)) {
                Float32Type *type
                    = (Float32Type*)S_load_type(FLOAT32TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "f64_t", 5)) {
                Float64Type *type
                    = (Float64Type*)S_load_type(FLOAT64TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else {
                THROW(ERR, "Unknown type '%o' for field '%o'", type_str,
                      field);
            }
        }
        else {
            FieldType *type = (FieldType*)CERTIFY(
                                  Freezer_load((Obj*)type_dump), FIELDTYPE);
            Schema_Spec_Field(loaded, field, type);
            DECREF(type);
        }
    }
    DECREF(iter);

    DECREF(analyzers);
    return loaded;
}
Indexer*
Indexer_init(Indexer *self, Schema *schema, Obj *index,
             IndexManager *manager, int32_t flags) {
    IndexerIVARS *const ivars = Indexer_IVARS(self);
    bool create   = (flags & Indexer_CREATE)   ? true : false;
    bool truncate = (flags & Indexer_TRUNCATE) ? true : false;
    Folder *folder = S_init_folder(index, create);
    Snapshot *latest_snapshot = Snapshot_new();

    // Init.
    ivars->stock_doc     = Doc_new(NULL, 0);
    ivars->truncate      = false;
    ivars->optimize      = false;
    ivars->prepared      = false;
    ivars->needs_commit  = false;
    ivars->snapfile      = NULL;
    ivars->merge_lock    = NULL;

    // Assign.
    ivars->folder  = folder;
    ivars->manager = manager
                     ? (IndexManager*)INCREF(manager)
                     : IxManager_new(NULL, NULL);
    IxManager_Set_Folder(ivars->manager, folder);

    // Get a write lock for this folder.
    Lock *write_lock = IxManager_Make_Write_Lock(ivars->manager);
    Lock_Clear_Stale(write_lock);
    if (Lock_Obtain(write_lock)) {
        // Only assign if successful, otherwise DESTROY unlocks -- bad!
        ivars->write_lock = write_lock;
    }
    else {
        DECREF(write_lock);
        DECREF(self);
        RETHROW(INCREF(Err_get_error()));
    }

    // Find the latest snapshot or create a new one.
    String *latest_snapfile = IxFileNames_latest_snapshot(folder);
    if (latest_snapfile) {
        Snapshot_Read_File(latest_snapshot, folder, latest_snapfile);
    }

    // Look for an existing Schema if one wasn't supplied.
    if (schema) {
        ivars->schema = (Schema*)INCREF(schema);
    }
    else {
        if (!latest_snapfile) {
            S_release_write_lock(self);
            THROW(ERR, "No Schema supplied, and can't find one in the index");
        }
        else {
            String *schema_file = S_find_schema_file(latest_snapshot);
            Obj *dump = Json_slurp_json(folder, schema_file);
            if (dump) { // read file successfully
                ivars->schema = (Schema*)CERTIFY(Freezer_load(dump), SCHEMA);
                schema = ivars->schema;
                DECREF(dump);
                schema_file = NULL;
            }
            else {
                THROW(ERR, "Failed to parse %o", schema_file);
            }
        }
    }

    // If we're clobbering, start with an empty Snapshot and an empty
    // PolyReader.  Otherwise, start with the most recent Snapshot and an
    // up-to-date PolyReader.
    if (truncate) {
        ivars->snapshot = Snapshot_new();
        ivars->polyreader = PolyReader_new(schema, folder, NULL, NULL, NULL);
        ivars->truncate = true;
    }
    else {
        // TODO: clone most recent snapshot rather than read it twice.
        ivars->snapshot = (Snapshot*)INCREF(latest_snapshot);
        ivars->polyreader = latest_snapfile
                            ? PolyReader_open((Obj*)folder, NULL, NULL)
                            : PolyReader_new(schema, folder, NULL, NULL, NULL);

        if (latest_snapfile) {
            // Make sure that any existing fields which may have been
            // dynamically added during past indexing sessions get added.
            Schema *old_schema = PolyReader_Get_Schema(ivars->polyreader);
            Schema_Eat(schema, old_schema);
        }
    }

    // Zap detritus from previous sessions.
    // Note: we have to feed FilePurger with the most recent snapshot file
    // now, but with the Indexer's snapshot later.
    FilePurger *file_purger
        = FilePurger_new(folder, latest_snapshot, ivars->manager);
    FilePurger_Purge(file_purger);
    DECREF(file_purger);

    // Create a new segment.
    int64_t new_seg_num
        = IxManager_Highest_Seg_Num(ivars->manager, latest_snapshot) + 1;
    Lock *merge_lock = IxManager_Make_Merge_Lock(ivars->manager);
    if (Lock_Is_Locked(merge_lock)) {
        // If there's a background merge process going on, stay out of its
        // way.
        Hash *merge_data = IxManager_Read_Merge_Data(ivars->manager);
        Obj *cutoff_obj = merge_data
                          ? Hash_Fetch_Utf8(merge_data, "cutoff", 6)
                          : NULL;
        if (!cutoff_obj) {
            DECREF(merge_lock);
            DECREF(merge_data);
            THROW(ERR, "Background merge detected, but can't read merge data");
        }
        else {
            int64_t cutoff = Json_obj_to_i64(cutoff_obj);
            if (cutoff >= new_seg_num) {
                new_seg_num = cutoff + 1;
            }
        }
        DECREF(merge_data);
    }
    ivars->segment = Seg_new(new_seg_num);

    // Add all known fields to Segment.
    Vector *fields = Schema_All_Fields(schema);
    for (size_t i = 0, max = Vec_Get_Size(fields); i < max; i++) {
        Seg_Add_Field(ivars->segment, (String*)Vec_Fetch(fields, i));
    }
    DECREF(fields);

    DECREF(merge_lock);

    // Create new SegWriter and FilePurger.
    ivars->file_purger
        = FilePurger_new(folder, ivars->snapshot, ivars->manager);
    ivars->seg_writer = SegWriter_new(ivars->schema, ivars->snapshot,
                                      ivars->segment, ivars->polyreader);
    SegWriter_Prep_Seg_Dir(ivars->seg_writer);

    // Grab a local ref to the DeletionsWriter.
    ivars->del_writer = (DeletionsWriter*)INCREF(
                            SegWriter_Get_Del_Writer(ivars->seg_writer));

    DECREF(latest_snapfile);
    DECREF(latest_snapshot);

    return self;
}
static bool
S_maybe_merge(Indexer *self, Vector *seg_readers) {
    IndexerIVARS *const ivars = Indexer_IVARS(self);
    bool      merge_happened  = false;
    size_t    num_seg_readers = Vec_Get_Size(seg_readers);
    Lock     *merge_lock      = IxManager_Make_Merge_Lock(ivars->manager);
    bool      got_merge_lock  = Lock_Obtain(merge_lock);
    int64_t   cutoff;

    if (got_merge_lock) {
        ivars->merge_lock = merge_lock;
        cutoff = 0;
    }
    else {
        // If something else holds the merge lock, don't interfere.
        Hash *merge_data = IxManager_Read_Merge_Data(ivars->manager);
        if (merge_data) {
            Obj *cutoff_obj = Hash_Fetch_Utf8(merge_data, "cutoff", 6);
            if (cutoff_obj) {
                cutoff = Json_obj_to_i64(cutoff_obj);
            }
            else {
                cutoff = INT64_MAX;
            }
            DECREF(merge_data);
        }
        else {
            cutoff = INT64_MAX;
        }
        DECREF(merge_lock);
    }

    // Get a list of segments to recycle.  Validate and confirm that there
    // are no dupes in the list.
    Vector *to_merge = IxManager_Recycle(ivars->manager, ivars->polyreader,
                                         ivars->del_writer, cutoff,
                                         ivars->optimize);

    Hash *seen = Hash_new(Vec_Get_Size(to_merge));
    for (size_t i = 0, max = Vec_Get_Size(to_merge); i < max; i++) {
        SegReader *seg_reader
            = (SegReader*)CERTIFY(Vec_Fetch(to_merge, i), SEGREADER);
        String *seg_name = SegReader_Get_Seg_Name(seg_reader);
        if (Hash_Fetch(seen, seg_name)) {
            DECREF(seen);
            DECREF(to_merge);
            THROW(ERR, "Recycle() tried to merge segment '%o' twice",
                  seg_name);
        }
        Hash_Store(seen, seg_name, (Obj*)CFISH_TRUE);
    }
    DECREF(seen);

    // Consolidate segments if either sparse or optimizing forced.
    for (size_t i = 0, max = Vec_Get_Size(to_merge); i < max; i++) {
        SegReader *seg_reader = (SegReader*)Vec_Fetch(to_merge, i);
        int64_t seg_num = SegReader_Get_Seg_Num(seg_reader);
        Matcher *deletions
            = DelWriter_Seg_Deletions(ivars->del_writer, seg_reader);
        I32Array *doc_map
            = DelWriter_Generate_Doc_Map(ivars->del_writer, deletions,
                                         SegReader_Doc_Max(seg_reader),
                                         (int32_t)Seg_Get_Count(ivars->segment));
        if (seg_num <= cutoff) {
            THROW(ERR, "Segment %o violates cutoff (%i64 <= %i64)",
                  SegReader_Get_Seg_Name(seg_reader), seg_num, cutoff);
        }
        SegWriter_Merge_Segment(ivars->seg_writer, seg_reader, doc_map);
        merge_happened = true;
        DECREF(deletions);
        DECREF(doc_map);
    }

    // Write out new deletions.
    if (DelWriter_Updated(ivars->del_writer)) {
        // Only write out if they haven't all been applied.
        if (Vec_Get_Size(to_merge) != num_seg_readers) {
            DelWriter_Finish(ivars->del_writer);
        }
    }

    DECREF(to_merge);

    return merge_happened;
}