static void test_spew_and_slurp(TestBatch *batch) { Obj *dump = S_make_dump(); Folder *folder = (Folder*)RAMFolder_new(NULL); CharBuf *foo = (CharBuf*)ZCB_WRAP_STR("foo", 3); bool_t result = Json_spew_json(dump, folder, foo); TEST_TRUE(batch, result, "spew_json returns true on success"); TEST_TRUE(batch, Folder_Exists(folder, foo), "spew_json wrote file"); Obj *got = Json_slurp_json(folder, foo); TEST_TRUE(batch, got && Obj_Equals(dump, got), "Round trip through spew_json and slurp_json"); DECREF(got); Err_set_error(NULL); result = Json_spew_json(dump, folder, foo); TEST_FALSE(batch, result, "Can't spew_json when file exists"); TEST_TRUE(batch, Err_get_error() != NULL, "Failed spew_json sets Err_error"); Err_set_error(NULL); CharBuf *bar = (CharBuf*)ZCB_WRAP_STR("bar", 3); got = Json_slurp_json(folder, bar); TEST_TRUE(batch, got == NULL, "slurp_json returns NULL when file doesn't exist"); TEST_TRUE(batch, Err_get_error() != NULL, "Failed slurp_json sets Err_error"); CharBuf *boffo = (CharBuf*)ZCB_WRAP_STR("boffo", 5); FileHandle *fh = Folder_Open_FileHandle(folder, boffo, FH_CREATE | FH_WRITE_ONLY); FH_Write(fh, "garbage", 7); DECREF(fh); Err_set_error(NULL); got = Json_slurp_json(folder, boffo); TEST_TRUE(batch, got == NULL, "slurp_json returns NULL when file doesn't contain valid JSON"); TEST_TRUE(batch, Err_get_error() != NULL, "Failed slurp_json sets Err_error"); DECREF(got); DECREF(dump); DECREF(folder); }
static void test_stemming(TestBatchRunner *runner) { FSFolder *modules_folder = TestUtils_modules_folder(); String *path = Str_newf("analysis/snowstem/source/test/tests.json"); Hash *tests = (Hash*)Json_slurp_json((Folder*)modules_folder, path); if (!tests) { RETHROW(Err_get_error()); } String *iso; Hash *lang_data; Hash_Iterate(tests); while (Hash_Next(tests, (Obj**)&iso, (Obj**)&lang_data)) { VArray *words = (VArray*)Hash_Fetch_Utf8(lang_data, "words", 5); VArray *stems = (VArray*)Hash_Fetch_Utf8(lang_data, "stems", 5); SnowballStemmer *stemmer = SnowStemmer_new(iso); for (uint32_t i = 0, max = VA_Get_Size(words); i < max; i++) { String *word = (String*)VA_Fetch(words, i); VArray *got = SnowStemmer_Split(stemmer, word); String *stem = (String*)VA_Fetch(got, 0); TEST_TRUE(runner, stem && Str_Is_A(stem, STRING) && Str_Equals(stem, VA_Fetch(stems, i)), "Stem %s: %s", Str_Get_Ptr8(iso), Str_Get_Ptr8(word) ); DECREF(got); } DECREF(stemmer); } DECREF(tests); DECREF(modules_folder); DECREF(path); }
bool LFLock_Maybe_Delete_File_IMP(LockFileLock *self, String *path, bool delete_mine, bool delete_other) { LockFileLockIVARS *const ivars = LFLock_IVARS(self); Folder *folder = ivars->folder; bool success = false; // Only delete locks that start with our lock name. if (!Str_Starts_With_Utf8(path, "locks", 5)) { return false; } StringIterator *iter = Str_Top(path); StrIter_Advance(iter, 5 + 1); if (!StrIter_Starts_With(iter, ivars->name)) { DECREF(iter); return false; } DECREF(iter); // Attempt to delete dead lock file. if (Folder_Exists(folder, path)) { Hash *hash = (Hash*)Json_slurp_json(folder, path); if (hash != NULL && Obj_Is_A((Obj*)hash, HASH)) { String *pid_buf = (String*)Hash_Fetch_Utf8(hash, "pid", 3); String *host = (String*)Hash_Fetch_Utf8(hash, "host", 4); String *name = (String*)Hash_Fetch_Utf8(hash, "name", 4); // Match hostname and lock name. if (host != NULL && Str_Is_A(host, STRING) && Str_Equals(host, (Obj*)ivars->host) && name != NULL && Str_Is_A(name, STRING) && Str_Equals(name, (Obj*)ivars->name) && pid_buf != NULL && Str_Is_A(pid_buf, STRING) ) { // Verify that pid is either mine or dead. int pid = (int)Str_To_I64(pid_buf); if ((delete_mine && pid == PID_getpid()) // This process. || (delete_other && !PID_active(pid)) // Dead pid. ) { if (Folder_Delete(folder, path)) { success = true; } else { String *mess = MAKE_MESS("Can't delete '%o'", path); DECREF(hash); Err_throw_mess(ERR, mess); } } } } DECREF(hash); } return success; }
bool_t LFLock_maybe_delete_file(LockFileLock *self, const CharBuf *path, bool_t delete_mine, bool_t delete_other) { Folder *folder = self->folder; bool_t success = false; ZombieCharBuf *scratch = ZCB_WRAP(path); // Only delete locks that start with our lock name. CharBuf *lock_dir_name = (CharBuf*)ZCB_WRAP_STR("locks", 5); if (!ZCB_Starts_With(scratch, lock_dir_name)) { return false; } ZCB_Nip(scratch, CB_Get_Size(lock_dir_name) + 1); if (!ZCB_Starts_With(scratch, self->name)) { return false; } // Attempt to delete dead lock file. if (Folder_Exists(folder, path)) { Hash *hash = (Hash*)Json_slurp_json(folder, path); if (hash != NULL && Obj_Is_A((Obj*)hash, HASH)) { CharBuf *pid_buf = (CharBuf*)Hash_Fetch_Str(hash, "pid", 3); CharBuf *host = (CharBuf*)Hash_Fetch_Str(hash, "host", 4); CharBuf *name = (CharBuf*)Hash_Fetch_Str(hash, "name", 4); // Match hostname and lock name. if (host != NULL && CB_Equals(host, (Obj*)self->host) && name != NULL && CB_Equals(name, (Obj*)self->name) && pid_buf != NULL ) { // Verify that pid is either mine or dead. int pid = (int)CB_To_I64(pid_buf); if ((delete_mine && pid == PID_getpid()) // This process. || (delete_other && !PID_active(pid)) // Dead pid. ) { if (Folder_Delete(folder, path)) { success = true; } else { CharBuf *mess = MAKE_MESS("Can't delete '%o'", path); DECREF(hash); Err_throw_mess(ERR, mess); } } } } DECREF(hash); } return success; }
Snapshot* Snapshot_Read_File_IMP(Snapshot *self, Folder *folder, String *path) { SnapshotIVARS *const ivars = Snapshot_IVARS(self); // Eliminate all prior data. Pick a snapshot file. S_zero_out(self); ivars->path = (path != NULL && Str_Get_Size(path) > 0) ? Str_Clone(path) : IxFileNames_latest_snapshot(folder); if (ivars->path) { Hash *snap_data = (Hash*)CERTIFY(Json_slurp_json(folder, ivars->path), HASH); Obj *format_obj = CERTIFY(Hash_Fetch_Utf8(snap_data, "format", 6), OBJ); int32_t format = (int32_t)Json_obj_to_i64(format_obj); Obj *subformat_obj = Hash_Fetch_Utf8(snap_data, "subformat", 9); int32_t subformat = subformat_obj ? (int32_t)Json_obj_to_i64(subformat_obj) : 0; // Verify that we can read the index properly. if (format > Snapshot_current_file_format) { THROW(ERR, "Snapshot format too recent: %i32, %i32", format, Snapshot_current_file_format); } // Build up list of entries. Vector *list = (Vector*)INCREF(CERTIFY( Hash_Fetch_Utf8(snap_data, "entries", 7), VECTOR)); if (format == 1 || (format == 2 && subformat < 1)) { Vector *cleaned = S_clean_segment_contents(list); DECREF(list); list = cleaned; } Hash_Clear(ivars->entries); for (uint32_t i = 0, max = Vec_Get_Size(list); i < max; i++) { String *entry = (String*)CERTIFY(Vec_Fetch(list, i), STRING); Hash_Store(ivars->entries, entry, (Obj*)CFISH_TRUE); } DECREF(list); DECREF(snap_data); } return self; }
Snapshot* Snapshot_read_file(Snapshot *self, Folder *folder, const CharBuf *path) { // Eliminate all prior data. Pick a snapshot file. S_zero_out(self); self->path = path ? CB_Clone(path) : IxFileNames_latest_snapshot(folder); if (self->path) { Hash *snap_data = (Hash*)CERTIFY( Json_slurp_json(folder, self->path), HASH); Obj *format_obj = CERTIFY( Hash_Fetch_Str(snap_data, "format", 6), OBJ); int32_t format = (int32_t)Obj_To_I64(format_obj); Obj *subformat_obj = Hash_Fetch_Str(snap_data, "subformat", 9); int32_t subformat = subformat_obj ? (int32_t)Obj_To_I64(subformat_obj) : 0; // Verify that we can read the index properly. if (format > Snapshot_current_file_format) { THROW(ERR, "Snapshot format too recent: %i32, %i32", format, Snapshot_current_file_format); } // Build up list of entries. VArray *list = (VArray*)CERTIFY( Hash_Fetch_Str(snap_data, "entries", 7), VARRAY); INCREF(list); if (format == 1 || (format == 2 && subformat < 1)) { VArray *cleaned = S_clean_segment_contents(list); DECREF(list); list = cleaned; } Hash_Clear(self->entries); for (uint32_t i = 0, max = VA_Get_Size(list); i < max; i++) { CharBuf *entry = (CharBuf*)CERTIFY( VA_Fetch(list, i), CHARBUF); Hash_Store(self->entries, (Obj*)entry, INCREF(&EMPTY)); } DECREF(list); DECREF(snap_data); } return self; }
bool Seg_Read_File_IMP(Segment *self, Folder *folder) { SegmentIVARS *const ivars = Seg_IVARS(self); String *filename = Str_newf("%o/segmeta.json", ivars->name); Hash *metadata = (Hash*)Json_slurp_json(folder, filename); Hash *my_metadata; // Bail unless the segmeta file was read successfully. DECREF(filename); if (!metadata) { return false; } CERTIFY(metadata, HASH); // Grab metadata for the Segment object itself. DECREF(ivars->metadata); ivars->metadata = metadata; my_metadata = (Hash*)CERTIFY(Hash_Fetch_Utf8(ivars->metadata, "segmeta", 7), HASH); // Assign. Obj *count = Hash_Fetch_Utf8(my_metadata, "count", 5); if (!count) { count = Hash_Fetch_Utf8(my_metadata, "doc_count", 9); } if (!count) { THROW(ERR, "Missing 'count'"); } else { ivars->count = Json_obj_to_i64(count); } // Get list of field nums. Vector *source_by_num = (Vector*)Hash_Fetch_Utf8(my_metadata, "field_names", 11); size_t num_fields = source_by_num ? Vec_Get_Size(source_by_num) : 0; if (source_by_num == NULL) { THROW(ERR, "Failed to extract 'field_names' from metadata"); } // Init. DECREF(ivars->by_num); DECREF(ivars->by_name); ivars->by_num = Vec_new(num_fields); ivars->by_name = Hash_new(num_fields); // Copy the list of fields from the source. for (size_t i = 0; i < num_fields; i++) { String *name = (String*)Vec_Fetch(source_by_num, i); Seg_Add_Field(self, name); } return true; }
bool_t Seg_read_file(Segment *self, Folder *folder) { CharBuf *filename = CB_newf("%o/segmeta.json", self->name); Hash *metadata = (Hash*)Json_slurp_json(folder, filename); Hash *my_metadata; // Bail unless the segmeta file was read successfully. DECREF(filename); if (!metadata) { return false; } CERTIFY(metadata, HASH); // Grab metadata for the Segment object itself. DECREF(self->metadata); self->metadata = metadata; my_metadata = (Hash*)CERTIFY(Hash_Fetch_Str(self->metadata, "segmeta", 7), HASH); // Assign. Obj *count = Hash_Fetch_Str(my_metadata, "count", 5); if (!count) { count = Hash_Fetch_Str(my_metadata, "doc_count", 9); } if (!count) { THROW(ERR, "Missing 'count'"); } else { self->count = Obj_To_I64(count); } // Get list of field nums. VArray *source_by_num = (VArray*)Hash_Fetch_Str(my_metadata, "field_names", 11); uint32_t num_fields = source_by_num ? VA_Get_Size(source_by_num) : 0; if (source_by_num == NULL) { THROW(ERR, "Failed to extract 'field_names' from metadata"); } // Init. DECREF(self->by_num); DECREF(self->by_name); self->by_num = VA_new(num_fields); self->by_name = Hash_new(num_fields); // Copy the list of fields from the source. for (uint32_t i = 0; i < num_fields; i++) { CharBuf *name = (CharBuf*)VA_Fetch(source_by_num, i); Seg_Add_Field(self, name); } return true; }
Hash* IxManager_Read_Merge_Data_IMP(IndexManager *self) { IndexManagerIVARS *const ivars = IxManager_IVARS(self); String *merge_json = SSTR_WRAP_C("merge.json"); if (Folder_Exists(ivars->folder, merge_json)) { Hash *stuff = (Hash*)Json_slurp_json(ivars->folder, merge_json); if (stuff) { CERTIFY(stuff, HASH); return stuff; } else { return Hash_new(0); } } else { return NULL; } }
Hash* IxManager_read_merge_data(IndexManager *self) { ZombieCharBuf *merge_json = ZCB_WRAP_STR("merge.json", 10); if (Folder_Exists(self->folder, (CharBuf*)merge_json)) { Hash *stuff = (Hash*)Json_slurp_json(self->folder, (CharBuf*)merge_json); if (stuff) { CERTIFY(stuff, HASH); return stuff; } else { return Hash_new(0); } } else { return NULL; } }
static void test_normalization(TestBatchRunner *runner) { FSFolder *modules_folder = TestUtils_modules_folder(); if (modules_folder == NULL) { SKIP(runner, 13, "Can't locate test data"); return; } String *path = Str_newf("unicode/utf8proc/tests.json"); Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path); if (!tests) { RETHROW(Err_get_error()); } for (uint32_t i = 0, max = Vec_Get_Size(tests); i < max; i++) { Hash *test = (Hash*)Vec_Fetch(tests, i); String *form = (String*)Hash_Fetch_Utf8( test, "normalization_form", 18); bool case_fold = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8( test, "case_fold", 9)); bool strip_accents = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8( test, "strip_accents", 13)); Normalizer *normalizer = Normalizer_new(form, case_fold, strip_accents); Vector *words = (Vector*)Hash_Fetch_Utf8(test, "words", 5); Vector *norms = (Vector*)Hash_Fetch_Utf8(test, "norms", 5); for (uint32_t j = 0, max = Vec_Get_Size(words); j < max; j++) { String *word = (String*)Vec_Fetch(words, j); Vector *got = Normalizer_Split(normalizer, word); String *norm = (String*)Vec_Fetch(got, 0); TEST_TRUE(runner, norm && Str_is_a(norm, STRING) && Str_Equals(norm, Vec_Fetch(norms, j)), "Normalize %s %d %d: %s", Str_Get_Ptr8(form), case_fold, strip_accents, Str_Get_Ptr8(word) ); DECREF(got); } DECREF(normalizer); } DECREF(tests); DECREF(modules_folder); DECREF(path); }
static void test_offsets(TestBatchRunner *runner) { Folder *folder = S_folder_with_contents(); CompoundFileWriter *cf_writer = CFWriter_new(folder); Hash *cf_metadata; Hash *files; CFWriter_Consolidate(cf_writer); cf_metadata = (Hash*)CERTIFY( Json_slurp_json(folder, cfmeta_file), HASH); files = (Hash*)CERTIFY( Hash_Fetch_Utf8(cf_metadata, "files", 5), HASH); bool offsets_ok = true; TEST_TRUE(runner, Hash_Get_Size(files) > 0, "Multiple files"); HashIterator *iter = HashIter_new(files); while (HashIter_Next(iter)) { String *file = HashIter_Get_Key(iter); Hash *stats = (Hash*)CERTIFY(HashIter_Get_Value(iter), HASH); Obj *offset = CERTIFY(Hash_Fetch_Utf8(stats, "offset", 6), OBJ); int64_t offs = Obj_To_I64(offset); if (offs % 8 != 0) { offsets_ok = false; FAIL(runner, "Offset %" PRId64 " for %s not a multiple of 8", offset, Str_Get_Ptr8(file)); break; } } DECREF(iter); if (offsets_ok) { PASS(runner, "All offsets are multiples of 8"); } DECREF(cf_metadata); DECREF(cf_writer); DECREF(folder); }
Snapshot* Snapshot_read_file(Snapshot *self, Folder *folder, const CharBuf *filename) { /* Eliminate all prior data. Pick a snapshot file. */ S_zero_out(self); self->filename = filename ? CB_Clone(filename) : IxFileNames_latest_snapshot(folder); if (self->filename) { Hash *snap_data = (Hash*)ASSERT_IS_A( Json_slurp_json(folder, self->filename), HASH); Obj *format = ASSERT_IS_A( Hash_Fetch_Str(snap_data, "format", 6), OBJ); /* Verify that we can read the index properly. */ if (Obj_To_I64(format) > Snapshot_current_file_format) { THROW("Snapshot format too recent: %i64, %i32", Obj_To_I64(format), Snapshot_current_file_format); } /* Build up list of entries. */ { u32_t i, max; VArray *list = (VArray*)ASSERT_IS_A( Hash_Fetch_Str(snap_data, "entries", 7), VARRAY); Hash_Clear(self->entries); for (i = 0, max = VA_Get_Size(list); i < max; i++) { CharBuf *entry = (CharBuf*)ASSERT_IS_A( VA_Fetch(list, i), CHARBUF); Hash_Store(self->entries, entry, INCREF(&EMPTY)); } } DECREF(snap_data); } return self; }
static void test_tokenizer(TestBatchRunner *runner) { StandardTokenizer *tokenizer = StandardTokenizer_new(); String *word = SSTR_WRAP_C( " ." "tha\xCC\x82t's" ":" "1,02\xC2\xADZ4.38" "\xE0\xB8\x81\xC2\xAD\xC2\xAD" "\xF0\xA0\x80\x80" "a" "/"); Vector *got = StandardTokenizer_Split(tokenizer, word); String *token = (String*)Vec_Fetch(got, 0); TEST_TRUE(runner, token && Str_is_a(token, STRING) && Str_Equals_Utf8(token, "tha\xcc\x82t's", 8), "Token: %s", Str_Get_Ptr8(token)); token = (String*)Vec_Fetch(got, 1); TEST_TRUE(runner, token && Str_is_a(token, STRING) && Str_Equals_Utf8(token, "1,02\xC2\xADZ4.38", 11), "Token: %s", Str_Get_Ptr8(token)); token = (String*)Vec_Fetch(got, 2); TEST_TRUE(runner, token && Str_is_a(token, STRING) && Str_Equals_Utf8(token, "\xE0\xB8\x81\xC2\xAD\xC2\xAD", 7), "Token: %s", Str_Get_Ptr8(token)); token = (String*)Vec_Fetch(got, 3); TEST_TRUE(runner, token && Str_is_a(token, STRING) && Str_Equals_Utf8(token, "\xF0\xA0\x80\x80", 4), "Token: %s", Str_Get_Ptr8(token)); token = (String*)Vec_Fetch(got, 4); TEST_TRUE(runner, token && Str_is_a(token, STRING) && Str_Equals_Utf8(token, "a", 1), "Token: %s", Str_Get_Ptr8(token)); DECREF(got); FSFolder *modules_folder = TestUtils_modules_folder(); if (modules_folder == NULL) { SKIP(runner, 1372, "Can't locate test data"); } else { String *path = Str_newf("unicode/ucd/WordBreakTest.json"); Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path); if (!tests) { RETHROW(Err_get_error()); } for (uint32_t i = 0, max = Vec_Get_Size(tests); i < max; i++) { Hash *test = (Hash*)Vec_Fetch(tests, i); String *text = (String*)Hash_Fetch_Utf8(test, "text", 4); Vector *wanted = (Vector*)Hash_Fetch_Utf8(test, "words", 5); Vector *got = StandardTokenizer_Split(tokenizer, text); TEST_TRUE(runner, Vec_Equals(wanted, (Obj*)got), "UCD test #%d", i + 1); DECREF(got); } DECREF(tests); DECREF(modules_folder); DECREF(path); } DECREF(tokenizer); }
Indexer* Indexer_init(Indexer *self, Schema *schema, Obj *index, IndexManager *manager, int32_t flags) { bool_t create = (flags & Indexer_CREATE) ? true : false; bool_t truncate = (flags & Indexer_TRUNCATE) ? true : false; Folder *folder = S_init_folder(index, create); Lock *write_lock; CharBuf *latest_snapfile; Snapshot *latest_snapshot = Snapshot_new(); // Init. self->stock_doc = Doc_new(NULL, 0); self->truncate = false; self->optimize = false; self->prepared = false; self->needs_commit = false; self->snapfile = NULL; self->merge_lock = NULL; // Assign. self->folder = folder; self->manager = manager ? (IndexManager*)INCREF(manager) : IxManager_new(NULL, NULL); IxManager_Set_Folder(self->manager, folder); // Get a write lock for this folder. write_lock = IxManager_Make_Write_Lock(self->manager); Lock_Clear_Stale(write_lock); if (Lock_Obtain(write_lock)) { // Only assign if successful, otherwise DESTROY unlocks -- bad! self->write_lock = write_lock; } else { DECREF(write_lock); DECREF(self); RETHROW(INCREF(Err_get_error())); } // Find the latest snapshot or create a new one. latest_snapfile = IxFileNames_latest_snapshot(folder); if (latest_snapfile) { Snapshot_Read_File(latest_snapshot, folder, latest_snapfile); } // Look for an existing Schema if one wasn't supplied. if (schema) { self->schema = (Schema*)INCREF(schema); } else { if (!latest_snapfile) { THROW(ERR, "No Schema supplied, and can't find one in the index"); } else { CharBuf *schema_file = S_find_schema_file(latest_snapshot); Hash *dump = (Hash*)Json_slurp_json(folder, schema_file); if (dump) { // read file successfully self->schema = (Schema*)CERTIFY( VTable_Load_Obj(SCHEMA, (Obj*)dump), SCHEMA); schema = self->schema; DECREF(dump); schema_file = NULL; } else { THROW(ERR, "Failed to parse %o", schema_file); } } } // If we're clobbering, start with an empty Snapshot and an empty // PolyReader. Otherwise, start with the most recent Snapshot and an // up-to-date PolyReader. if (truncate) { self->snapshot = Snapshot_new(); self->polyreader = PolyReader_new(schema, folder, NULL, NULL, NULL); self->truncate = true; } else { // TODO: clone most recent snapshot rather than read it twice. self->snapshot = (Snapshot*)INCREF(latest_snapshot); self->polyreader = latest_snapfile ? PolyReader_open((Obj*)folder, NULL, NULL) : PolyReader_new(schema, folder, NULL, NULL, NULL); if (latest_snapfile) { // Make sure than any existing fields which may have been // dynamically added during past indexing sessions get added. Schema *old_schema = PolyReader_Get_Schema(self->polyreader); Schema_Eat(schema, old_schema); } } // Zap detritus from previous sessions. { // Note: we have to feed FilePurger with the most recent snapshot file // now, but with the Indexer's snapshot later. FilePurger *file_purger = FilePurger_new(folder, latest_snapshot, self->manager); FilePurger_Purge(file_purger); DECREF(file_purger); } // Create a new segment. { int64_t new_seg_num = IxManager_Highest_Seg_Num(self->manager, latest_snapshot) + 1; Lock *merge_lock = IxManager_Make_Merge_Lock(self->manager); uint32_t i, max; if (Lock_Is_Locked(merge_lock)) { // If there's a background merge process going on, stay out of its // way. Hash *merge_data = IxManager_Read_Merge_Data(self->manager); Obj *cutoff_obj = merge_data ? Hash_Fetch_Str(merge_data, "cutoff", 6) : NULL; if (!cutoff_obj) { DECREF(merge_lock); DECREF(merge_data); THROW(ERR, "Background merge detected, but can't read merge data"); } else { int64_t cutoff = Obj_To_I64(cutoff_obj); if (cutoff >= new_seg_num) { new_seg_num = cutoff + 1; } } DECREF(merge_data); } self->segment = Seg_new(new_seg_num); // Add all known fields to Segment. { VArray *fields = Schema_All_Fields(schema); for (i = 0, max = VA_Get_Size(fields); i < max; i++) { Seg_Add_Field(self->segment, (CharBuf*)VA_Fetch(fields, i)); } DECREF(fields); } DECREF(merge_lock); } // Create new SegWriter and FilePurger. self->file_purger = FilePurger_new(folder, self->snapshot, self->manager); self->seg_writer = SegWriter_new(self->schema, self->snapshot, self->segment, self->polyreader); SegWriter_Prep_Seg_Dir(self->seg_writer); // Grab a local ref to the DeletionsWriter. self->del_writer = (DeletionsWriter*)INCREF( SegWriter_Get_Del_Writer(self->seg_writer)); DECREF(latest_snapfile); DECREF(latest_snapshot); return self; }
static void test_open(TestBatchRunner *runner) { Folder *real_folder; CompoundFileReader *cf_reader; Hash *metadata; Err_set_error(NULL); real_folder = S_folder_with_contents(); Folder_Delete(real_folder, cfmeta_file); cf_reader = CFReader_open(real_folder); TEST_TRUE(runner, cf_reader == NULL, "Return NULL when cfmeta file missing"); TEST_TRUE(runner, Err_get_error() != NULL, "Set global error when cfmeta file missing"); DECREF(real_folder); Err_set_error(NULL); real_folder = S_folder_with_contents(); Folder_Delete(real_folder, cf_file); cf_reader = CFReader_open(real_folder); TEST_TRUE(runner, cf_reader == NULL, "Return NULL when cf.dat file missing"); TEST_TRUE(runner, Err_get_error() != NULL, "Set global error when cf.dat file missing"); DECREF(real_folder); Err_set_error(NULL); real_folder = S_folder_with_contents(); metadata = (Hash*)Json_slurp_json(real_folder, cfmeta_file); Hash_Store_Utf8(metadata, "format", 6, (Obj*)Str_newf("%i32", -1)); Folder_Delete(real_folder, cfmeta_file); Json_spew_json((Obj*)metadata, real_folder, cfmeta_file); cf_reader = CFReader_open(real_folder); TEST_TRUE(runner, cf_reader == NULL, "Return NULL when format is invalid"); TEST_TRUE(runner, Err_get_error() != NULL, "Set global error when format is invalid"); Err_set_error(NULL); Hash_Store_Utf8(metadata, "format", 6, (Obj*)Str_newf("%i32", 1000)); Folder_Delete(real_folder, cfmeta_file); Json_spew_json((Obj*)metadata, real_folder, cfmeta_file); cf_reader = CFReader_open(real_folder); TEST_TRUE(runner, cf_reader == NULL, "Return NULL when format is too recent"); TEST_TRUE(runner, Err_get_error() != NULL, "Set global error when format too recent"); Err_set_error(NULL); DECREF(Hash_Delete_Utf8(metadata, "format", 6)); Folder_Delete(real_folder, cfmeta_file); Json_spew_json((Obj*)metadata, real_folder, cfmeta_file); cf_reader = CFReader_open(real_folder); TEST_TRUE(runner, cf_reader == NULL, "Return NULL when format key is missing"); TEST_TRUE(runner, Err_get_error() != NULL, "Set global error when format key is missing"); Hash_Store_Utf8(metadata, "format", 6, (Obj*)Str_newf("%i32", CFWriter_current_file_format)); DECREF(Hash_Delete_Utf8(metadata, "files", 5)); Folder_Delete(real_folder, cfmeta_file); Json_spew_json((Obj*)metadata, real_folder, cfmeta_file); cf_reader = CFReader_open(real_folder); TEST_TRUE(runner, cf_reader == NULL, "Return NULL when files key is missing"); TEST_TRUE(runner, Err_get_error() != NULL, "Set global error when files key is missing"); DECREF(metadata); DECREF(real_folder); }
void S_try_open_elements(void *context) { struct try_open_elements_context *args = (struct try_open_elements_context*)context; PolyReader *self = args->self; PolyReaderIVARS *const ivars = PolyReader_IVARS(self); VArray *files = Snapshot_List(ivars->snapshot); Folder *folder = PolyReader_Get_Folder(self); uint32_t num_segs = 0; uint64_t latest_schema_gen = 0; CharBuf *schema_file = NULL; // Find schema file, count segments. for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) { CharBuf *entry = (CharBuf*)VA_Fetch(files, i); if (Seg_valid_seg_name(entry)) { num_segs++; } else if (CB_Starts_With_Str(entry, "schema_", 7) && CB_Ends_With_Str(entry, ".json", 5) ) { uint64_t gen = IxFileNames_extract_gen(entry); if (gen > latest_schema_gen) { latest_schema_gen = gen; if (!schema_file) { schema_file = CB_Clone(entry); } else { CB_Mimic(schema_file, (Obj*)entry); } } } } // Read Schema. if (!schema_file) { DECREF(files); THROW(ERR, "Can't find a schema file."); } else { Hash *dump = (Hash*)Json_slurp_json(folder, schema_file); if (dump) { // read file successfully DECREF(ivars->schema); ivars->schema = (Schema*)CERTIFY( VTable_Load_Obj(SCHEMA, (Obj*)dump), SCHEMA); DECREF(dump); DECREF(schema_file); schema_file = NULL; } else { CharBuf *mess = MAKE_MESS("Failed to parse %o", schema_file); DECREF(schema_file); DECREF(files); Err_throw_mess(ERR, mess); } } VArray *segments = VA_new(num_segs); for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) { CharBuf *entry = (CharBuf*)VA_Fetch(files, i); // Create a Segment for each segmeta. if (Seg_valid_seg_name(entry)) { int64_t seg_num = IxFileNames_extract_gen(entry); Segment *segment = Seg_new(seg_num); // Bail if reading the file fails (probably because it's been // deleted and a new snapshot file has been written so we need to // retry). if (Seg_Read_File(segment, folder)) { VA_Push(segments, (Obj*)segment); } else { CharBuf *mess = MAKE_MESS("Failed to read %o", entry); DECREF(segment); DECREF(segments); DECREF(files); Err_throw_mess(ERR, mess); } } } // Sort the segments by age. VA_Sort(segments, NULL, NULL); // Open individual SegReaders. struct try_open_segreader_context seg_context; seg_context.schema = PolyReader_Get_Schema(self); seg_context.folder = folder; seg_context.snapshot = PolyReader_Get_Snapshot(self); seg_context.segments = segments; seg_context.result = NULL; args->seg_readers = VA_new(num_segs); Err *error = NULL; for (uint32_t seg_tick = 0; seg_tick < num_segs; seg_tick++) { seg_context.seg_tick = seg_tick; error = Err_trap(S_try_open_segreader, &seg_context); if (error) { break; } VA_Push(args->seg_readers, (Obj*)seg_context.result); seg_context.result = NULL; } DECREF(segments); DECREF(files); if (error) { DECREF(args->seg_readers); args->seg_readers = NULL; RETHROW(error); } }
CompoundFileReader* CFReader_do_open(CompoundFileReader *self, Folder *folder) { CharBuf *cfmeta_file = (CharBuf*)ZCB_WRAP_STR("cfmeta.json", 11); Hash *metadata = (Hash*)Json_slurp_json((Folder*)folder, cfmeta_file); Err *error = NULL; Folder_init((Folder*)self, Folder_Get_Path(folder)); // Parse metadata file. if (!metadata || !Hash_Is_A(metadata, HASH)) { error = Err_new(CB_newf("Can't read '%o' in '%o'", cfmeta_file, Folder_Get_Path(folder))); } else { Obj *format = Hash_Fetch_Str(metadata, "format", 6); self->format = format ? (int32_t)Obj_To_I64(format) : 0; self->records = (Hash*)INCREF(Hash_Fetch_Str(metadata, "files", 5)); if (self->format < 1) { error = Err_new(CB_newf("Corrupt %o file: Missing or invalid 'format'", cfmeta_file)); } else if (self->format > CFWriter_current_file_format) { error = Err_new(CB_newf("Unsupported compound file format: %i32 " "(current = %i32", self->format, CFWriter_current_file_format)); } else if (!self->records) { error = Err_new(CB_newf("Corrupt %o file: missing 'files' key", cfmeta_file)); } } DECREF(metadata); if (error) { Err_set_error(error); DECREF(self); return NULL; } // Open an instream which we'll clone over and over. CharBuf *cf_file = (CharBuf*)ZCB_WRAP_STR("cf.dat", 6); self->instream = Folder_Open_In(folder, cf_file); if (!self->instream) { ERR_ADD_FRAME(Err_get_error()); DECREF(self); return NULL; } // Assign. self->real_folder = (Folder*)INCREF(folder); // Strip directory name from filepaths for old format. if (self->format == 1) { VArray *files = Hash_Keys(self->records); ZombieCharBuf *filename = ZCB_BLANK(); ZombieCharBuf *folder_name = IxFileNames_local_part(Folder_Get_Path(folder), ZCB_BLANK()); size_t folder_name_len = ZCB_Length(folder_name); for (uint32_t i = 0, max = VA_Get_Size(files); i < max; i++) { CharBuf *orig = (CharBuf*)VA_Fetch(files, i); if (CB_Starts_With(orig, (CharBuf*)folder_name)) { Obj *record = Hash_Delete(self->records, (Obj*)orig); ZCB_Assign(filename, orig); ZCB_Nip(filename, folder_name_len + sizeof(DIR_SEP) - 1); Hash_Store(self->records, (Obj*)filename, (Obj*)record); } } DECREF(files); } return self; }
CompoundFileReader* CFReader_do_open(CompoundFileReader *self, Folder *folder) { CompoundFileReaderIVARS *const ivars = CFReader_IVARS(self); String *cfmeta_file = (String*)SSTR_WRAP_UTF8("cfmeta.json", 11); Hash *metadata = (Hash*)Json_slurp_json((Folder*)folder, cfmeta_file); Err *error = NULL; Folder_init((Folder*)self, Folder_Get_Path(folder)); // Parse metadata file. if (!metadata || !Hash_Is_A(metadata, HASH)) { error = Err_new(Str_newf("Can't read '%o' in '%o'", cfmeta_file, Folder_Get_Path(folder))); } else { Obj *format = Hash_Fetch_Utf8(metadata, "format", 6); ivars->format = format ? (int32_t)Obj_To_I64(format) : 0; ivars->records = (Hash*)INCREF(Hash_Fetch_Utf8(metadata, "files", 5)); if (ivars->format < 1) { error = Err_new(Str_newf("Corrupt %o file: Missing or invalid 'format'", cfmeta_file)); } else if (ivars->format > CFWriter_current_file_format) { error = Err_new(Str_newf("Unsupported compound file format: %i32 " "(current = %i32", ivars->format, CFWriter_current_file_format)); } else if (!ivars->records) { error = Err_new(Str_newf("Corrupt %o file: missing 'files' key", cfmeta_file)); } } DECREF(metadata); if (error) { Err_set_error(error); DECREF(self); return NULL; } // Open an instream which we'll clone over and over. String *cf_file = (String*)SSTR_WRAP_UTF8("cf.dat", 6); ivars->instream = Folder_Open_In(folder, cf_file); if (!ivars->instream) { ERR_ADD_FRAME(Err_get_error()); DECREF(self); return NULL; } // Assign. ivars->real_folder = (Folder*)INCREF(folder); // Strip directory name from filepaths for old format. if (ivars->format == 1) { Vector *files = Hash_Keys(ivars->records); String *folder_name = IxFileNames_local_part(Folder_Get_Path(folder)); size_t folder_name_len = Str_Length(folder_name); for (uint32_t i = 0, max = Vec_Get_Size(files); i < max; i++) { String *orig = (String*)Vec_Fetch(files, i); if (Str_Starts_With(orig, folder_name)) { Obj *record = Hash_Delete(ivars->records, orig); size_t offset = folder_name_len + sizeof(CHY_DIR_SEP) - 1; size_t len = Str_Length(orig) - offset; String *filename = Str_SubString(orig, offset, len); Hash_Store(ivars->records, filename, (Obj*)record); DECREF(filename); } } DECREF(folder_name); DECREF(files); } return self; }