static void
test_highlighting(TestBatchRunner *runner) {
    // Schema with two highlightable full-text fields.  "alt" gets a low
    // boost so matches there score beneath matches in "content".
    Schema *schema = Schema_new();
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    FullTextType *plain_type = FullTextType_new((Analyzer*)tokenizer);
    FullTextType_Set_Highlightable(plain_type, true);
    FullTextType *dunked_type = FullTextType_new((Analyzer*)tokenizer);
    FullTextType_Set_Highlightable(dunked_type, true);
    FullTextType_Set_Boost(dunked_type, 0.1f);
    String *content = (String*)SSTR_WRAP_UTF8("content", 7);
    Schema_Spec_Field(schema, content, (FieldType*)plain_type);
    String *alt = (String*)SSTR_WRAP_UTF8("alt", 3);
    Schema_Spec_Field(schema, alt, (FieldType*)dunked_type);
    DECREF(plain_type);
    DECREF(dunked_type);
    DECREF(tokenizer);

    RAMFolder *folder  = RAMFolder_new(NULL);
    Indexer   *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);

    // Doc 1: the canonical test string.
    Doc *doc = Doc_new(NULL, 0);
    String *field_val = (String*)SSTR_WRAP_UTF8(TEST_STRING, TEST_STRING_LEN);
    Doc_Store(doc, content, (Obj*)field_val);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    // Doc 2: punctuation-heavy content.
    doc = Doc_new(NULL, 0);
    field_val = (String*)SSTR_WRAP_UTF8("\"I see,\" said the blind man.", 28);
    Doc_Store(doc, content, (Obj*)field_val);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    // Doc 3: weak match in "content", full match in the down-boosted "alt".
    doc = Doc_new(NULL, 0);
    field_val = (String*)SSTR_WRAP_UTF8("x but not why or 2ee", 20);
    Doc_Store(doc, content, (Obj*)field_val);
    field_val = (String*)SSTR_WRAP_UTF8(
                    TEST_STRING " and extra stuff so it scores lower",
                    TEST_STRING_LEN + 35);
    Doc_Store(doc, alt, (Obj*)field_val);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    Indexer_Commit(indexer);
    DECREF(indexer);

    // Run the excerpt sub-tests against a phrase-plus-term query.
    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)folder);
    Obj *query = (Obj*)SSTR_WRAP_UTF8("\"x y z\" AND " PHI, 14);
    Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);
    test_Raw_Excerpt(runner, searcher, query);
    test_Highlight_Excerpt(runner, searcher, query);
    test_Create_Excerpt(runner, searcher, query, hits);

    DECREF(hits);
    DECREF(searcher);
    DECREF(folder);
    DECREF(schema);
}
// Build a two-field Doc ("title" + "content") from NUL-terminated UTF-8
// C strings and hand it to the Indexer.
static void
S_add_document(Indexer *indexer, const char *title, const char *content) {
    Doc *doc = Doc_new(NULL, 0);

    // Store both fields via a parallel name/value table.
    const char *field_names[]  = { "title", "content" };
    const char *field_values[] = { title,   content   };
    for (size_t i = 0; i < 2; i++) {
        String *name  = Str_new_from_utf8(field_names[i],
                                          strlen(field_names[i]));
        String *value = Str_new_from_utf8(field_values[i],
                                          strlen(field_values[i]));
        Doc_Store(doc, name, (Obj*)value);
        DECREF(name);
        DECREF(value);
    }

    Indexer_Add_Doc(indexer, doc, 1.0);
    DECREF(doc);
}
static Folder* S_create_index() { Schema *schema = (Schema*)TestSchema_new(); RAMFolder *folder = RAMFolder_new(NULL); VArray *doc_set = TestUtils_doc_set(); Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, NULL, 0); u32_t i, max; for (i = 0, max = VA_Get_Size(doc_set); i < max; i++) { static CharBuf field = ZCB_LITERAL("content"); Doc *doc = Doc_new(NULL, 0); Doc_Store(doc, &field, VA_Fetch(doc_set, i)); Indexer_Add_Doc(indexer, doc, 1.0f); DECREF(doc); } Indexer_Commit(indexer); DECREF(doc_set); DECREF(indexer); DECREF(schema); return (Folder*)folder; }
static void
test_hl_selection(TestBatchRunner *runner) {
    // Schema with a single highlightable "content" field.
    Schema *schema = Schema_new();
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    FullTextType *plain_type = FullTextType_new((Analyzer*)tokenizer);
    FullTextType_Set_Highlightable(plain_type, true);
    String *content = (String*)SSTR_WRAP_UTF8("content", 7);
    Schema_Spec_Field(schema, content, (FieldType*)plain_type);
    DECREF(plain_type);
    DECREF(tokenizer);

    RAMFolder *folder  = RAMFolder_new(NULL);
    Indexer   *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);

    // Filler text with the two query terms ("NNN", "MMM") buried mid-text,
    // forcing the excerpt selector to pick a window around one of them.
    static char test_string[] =
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla "
        "bla bla bla NNN bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla "
        "bla bla bla. "
        "bla bla bla MMM bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. ";

    Doc *doc = Doc_new(NULL, 0);
    String *body
        = (String*)SSTR_WRAP_UTF8(test_string, strlen(test_string));
    Doc_Store(doc, content, (Obj*)body);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);
    Indexer_Commit(indexer);
    DECREF(indexer);

    // A 200-char excerpt must contain at least one whole query term rather
    // than a fragment chopped at a sentence boundary.
    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)folder);
    Obj *query = (Obj*)SSTR_WRAP_UTF8("NNN MMM", 7);
    Highlighter *highlighter = Highlighter_new(searcher, query, content, 200);
    Hits   *hits    = Searcher_Hits(searcher, query, 0, 10, NULL);
    HitDoc *hit     = Hits_Next(hits);
    String *excerpt = Highlighter_Create_Excerpt(highlighter, hit);
    String *mmm     = (String*)SSTR_WRAP_UTF8("MMM", 3);
    String *nnn     = (String*)SSTR_WRAP_UTF8("NNN", 3);
    TEST_TRUE(runner,
              Str_Find(excerpt, mmm) >= 0 || Str_Find(excerpt, nnn) >= 0,
              "Sentence boundary algo doesn't chop terms");

    DECREF(excerpt);
    DECREF(hit);
    DECREF(hits);
    DECREF(highlighter);
    DECREF(searcher);
    DECREF(folder);
    DECREF(schema);
}
static void
test_simple(TestBatchRunner *runner) {
    RAMFolder *folder     = RAMFolder_new(NULL);
    String    *language   = SSTR_WRAP_UTF8("en", 2);
    Simple    *lucy       = Simple_new((Obj*)folder, language);
    String    *food_field = SSTR_WRAP_UTF8("food", 4);

    // Add one doc; a search must see it immediately.
    {
        Doc *doc = Doc_new(NULL, 0);
        String *food = SSTR_WRAP_UTF8("creamed corn", 12);
        Doc_Store(doc, food_field, (Obj*)food);
        Simple_Add_Doc(lucy, doc);
        DECREF(doc);
        String *query = SSTR_WRAP_UTF8("creamed", 7);
        uint32_t hit_count = Simple_Search(lucy, query, 0, 10);
        TEST_INT_EQ(runner, hit_count, 1, "Search works right after add");
    }

    // Add a second doc; the hit count reflects the running total.
    {
        Doc *doc = Doc_new(NULL, 0);
        String *food = SSTR_WRAP_UTF8("creamed spinach", 15);
        Doc_Store(doc, food_field, (Obj*)food);
        Simple_Add_Doc(lucy, doc);
        DECREF(doc);
        String *query = SSTR_WRAP_UTF8("creamed", 7);
        uint32_t hit_count = Simple_Search(lucy, query, 0, 10);
        TEST_INT_EQ(runner, hit_count, 2, "Search returns total hits");
    }

    // Destroying the Simple object must commit pending docs; a fresh
    // instance over the same folder then sees all three.
    {
        Doc *doc = Doc_new(NULL, 0);
        String *food = SSTR_WRAP_UTF8("creamed broccoli", 16);
        Doc_Store(doc, food_field, (Obj*)food);
        Simple_Add_Doc(lucy, doc);
        DECREF(doc);
        DECREF(lucy);
        lucy = Simple_new((Obj*)folder, language);
        String *query = SSTR_WRAP_UTF8("cream", 5);
        uint32_t hit_count = Simple_Search(lucy, query, 0, 10);
        TEST_INT_EQ(runner, hit_count, 3, "commit upon destroy");
        HitDoc *hit;
        while ((hit = Simple_Next(lucy)) != NULL) {
            String *food = (String*)HitDoc_Extract(hit, food_field);
            TEST_TRUE(runner, Str_Starts_With_Utf8(food, "cream", 5), "Next");
            DECREF(food);
            DECREF(hit);
        }
    }

    // A doc stored under a different field still matches.
    {
        Doc *doc = Doc_new(NULL, 0);
        String *band_field = SSTR_WRAP_UTF8("band", 4);
        String *band = SSTR_WRAP_UTF8("Cream", 5);
        Doc_Store(doc, band_field, (Obj*)band);
        Simple_Add_Doc(lucy, doc);
        DECREF(doc);
        String *query = SSTR_WRAP_UTF8("cream", 5);
        uint32_t hit_count = Simple_Search(lucy, query, 0, 10);
        TEST_INT_EQ(runner, hit_count, 4, "Search uses correct EasyAnalyzer");
    }

    DECREF(lucy);
    DECREF(folder);
}
static Folder* build_index() { // Plain type. String *pattern = Str_newf("\\S+"); RegexTokenizer *tokenizer = RegexTokenizer_new(pattern); FullTextType *plain = FullTextType_new((Analyzer*)tokenizer); // Fancy type. String *word_pattern = Str_newf("\\w+"); RegexTokenizer *word_tokenizer = RegexTokenizer_new(word_pattern); Hash *stop_list = Hash_new(0); Hash_Store_Utf8(stop_list, "x", 1, (Obj*)CFISH_TRUE); SnowballStopFilter *stop_filter = SnowStop_new(NULL, stop_list); Vector *analyzers = Vec_new(0); Vec_Push(analyzers, (Obj*)word_tokenizer); Vec_Push(analyzers, (Obj*)stop_filter); PolyAnalyzer *fancy_analyzer = PolyAnalyzer_new(NULL, analyzers); FullTextType *fancy = FullTextType_new((Analyzer*)fancy_analyzer); // Schema. Schema *schema = Schema_new(); String *plain_str = Str_newf("plain"); String *fancy_str = Str_newf("fancy"); Schema_Spec_Field(schema, plain_str, (FieldType*)plain); Schema_Spec_Field(schema, fancy_str, (FieldType*)fancy); // Indexer. RAMFolder *folder = RAMFolder_new(NULL); Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0); // Index documents. Vector *doc_set = TestUtils_doc_set(); for (uint32_t i = 0; i < Vec_Get_Size(doc_set); ++i) { String *content_string = (String*)Vec_Fetch(doc_set, i); Doc *doc = Doc_new(NULL, 0); Doc_Store(doc, plain_str, (Obj*)content_string); Doc_Store(doc, fancy_str, (Obj*)content_string); Indexer_Add_Doc(indexer, doc, 1.0); DECREF(doc); } Indexer_Commit(indexer); // Clean up. DECREF(doc_set); DECREF(indexer); DECREF(fancy_str); DECREF(plain_str); DECREF(schema); DECREF(fancy); DECREF(fancy_analyzer); DECREF(analyzers); DECREF(stop_list); DECREF(word_pattern); DECREF(plain); DECREF(tokenizer); DECREF(pattern); return (Folder*)folder; }
/* Initialize an Indexer: obtain the folder write lock, resolve the Schema,
 * establish the Snapshot/PolyReader pair, purge leftover files from prior
 * sessions, and set up a fresh Segment plus SegWriter for this session.
 *
 * `index` may be a Folder or a path (resolved by S_init_folder); `manager`
 * may be NULL, in which case a default IndexManager is created.  `flags`
 * accepts Indexer_CREATE and Indexer_TRUNCATE.  If the write lock cannot
 * be obtained, `self` is destroyed and the pending Err is rethrown.
 */
Indexer*
Indexer_init(Indexer *self, Schema *schema, Obj *index,
             IndexManager *manager, int32_t flags) {
    bool_t create = (flags & Indexer_CREATE) ? true : false;
    bool_t truncate = (flags & Indexer_TRUNCATE) ? true : false;
    Folder *folder = S_init_folder(index, create);
    Lock *write_lock;
    CharBuf *latest_snapfile;
    Snapshot *latest_snapshot = Snapshot_new();

    // Init.
    self->stock_doc = Doc_new(NULL, 0);
    self->truncate = false;
    self->optimize = false;
    self->prepared = false;
    self->needs_commit = false;
    self->snapfile = NULL;
    self->merge_lock = NULL;

    // Assign.
    self->folder = folder;
    self->manager = manager
                    ? (IndexManager*)INCREF(manager)
                    : IxManager_new(NULL, NULL);
    IxManager_Set_Folder(self->manager, folder);

    // Get a write lock for this folder.
    write_lock = IxManager_Make_Write_Lock(self->manager);
    Lock_Clear_Stale(write_lock);
    if (Lock_Obtain(write_lock)) {
        // Only assign if successful, otherwise DESTROY unlocks -- bad!
        self->write_lock = write_lock;
    }
    else {
        DECREF(write_lock);
        DECREF(self);
        RETHROW(INCREF(Err_get_error()));
    }

    // Find the latest snapshot or create a new one.
    latest_snapfile = IxFileNames_latest_snapshot(folder);
    if (latest_snapfile) {
        Snapshot_Read_File(latest_snapshot, folder, latest_snapfile);
    }

    // Look for an existing Schema if one wasn't supplied.
    if (schema) {
        self->schema = (Schema*)INCREF(schema);
    }
    else {
        if (!latest_snapfile) {
            THROW(ERR, "No Schema supplied, and can't find one in the index");
        }
        else {
            CharBuf *schema_file = S_find_schema_file(latest_snapshot);
            Hash *dump = (Hash*)Json_slurp_json(folder, schema_file);
            if (dump) { // read file successfully
                self->schema = (Schema*)CERTIFY(
                                   VTable_Load_Obj(SCHEMA, (Obj*)dump),
                                   SCHEMA);
                schema = self->schema;
                DECREF(dump);
                schema_file = NULL;
            }
            else {
                THROW(ERR, "Failed to parse %o", schema_file);
            }
        }
    }

    // If we're clobbering, start with an empty Snapshot and an empty
    // PolyReader. Otherwise, start with the most recent Snapshot and an
    // up-to-date PolyReader.
    if (truncate) {
        self->snapshot = Snapshot_new();
        self->polyreader = PolyReader_new(schema, folder, NULL, NULL, NULL);
        self->truncate = true;
    }
    else {
        // TODO: clone most recent snapshot rather than read it twice.
        self->snapshot = (Snapshot*)INCREF(latest_snapshot);
        self->polyreader = latest_snapfile
                           ? PolyReader_open((Obj*)folder, NULL, NULL)
                           : PolyReader_new(schema, folder, NULL, NULL, NULL);
        if (latest_snapfile) {
            // Make sure that any existing fields which may have been
            // dynamically added during past indexing sessions get added.
            Schema *old_schema = PolyReader_Get_Schema(self->polyreader);
            Schema_Eat(schema, old_schema);
        }
    }

    // Zap detritus from previous sessions.
    {
        // Note: we have to feed FilePurger with the most recent snapshot file
        // now, but with the Indexer's snapshot later.
        FilePurger *file_purger
            = FilePurger_new(folder, latest_snapshot, self->manager);
        FilePurger_Purge(file_purger);
        DECREF(file_purger);
    }

    // Create a new segment.
    {
        int64_t new_seg_num
            = IxManager_Highest_Seg_Num(self->manager, latest_snapshot) + 1;
        Lock *merge_lock = IxManager_Make_Merge_Lock(self->manager);
        uint32_t i, max;

        if (Lock_Is_Locked(merge_lock)) {
            // If there's a background merge process going on, stay out of its
            // way.
            Hash *merge_data = IxManager_Read_Merge_Data(self->manager);
            Obj *cutoff_obj = merge_data
                              ? Hash_Fetch_Str(merge_data, "cutoff", 6)
                              : NULL;
            if (!cutoff_obj) {
                DECREF(merge_lock);
                DECREF(merge_data);
                THROW(ERR, "Background merge detected, but can't read merge data");
            }
            else {
                int64_t cutoff = Obj_To_I64(cutoff_obj);
                if (cutoff >= new_seg_num) {
                    new_seg_num = cutoff + 1;
                }
            }
            DECREF(merge_data);
        }

        self->segment = Seg_new(new_seg_num);

        // Add all known fields to Segment.
        {
            VArray *fields = Schema_All_Fields(schema);
            for (i = 0, max = VA_Get_Size(fields); i < max; i++) {
                Seg_Add_Field(self->segment, (CharBuf*)VA_Fetch(fields, i));
            }
            DECREF(fields);
        }

        DECREF(merge_lock);
    }

    // Create new SegWriter and FilePurger.
    self->file_purger
        = FilePurger_new(folder, self->snapshot, self->manager);
    self->seg_writer = SegWriter_new(self->schema, self->snapshot,
                                     self->segment, self->polyreader);
    SegWriter_Prep_Seg_Dir(self->seg_writer);

    // Grab a local ref to the DeletionsWriter.
    self->del_writer = (DeletionsWriter*)INCREF(
                           SegWriter_Get_Del_Writer(self->seg_writer));

    DECREF(latest_snapfile);
    DECREF(latest_snapshot);

    return self;
}
// Store `value` (a NUL-terminated UTF-8 C string) in `doc` under the field
// named `name`.  Helper for S_parse_file; releases its temporaries.
static void
S_store_utf8_field(Doc *doc, const char *name, const char *value) {
    String *field     = Str_new_from_utf8(name, strlen(name));
    String *field_val = Str_new_from_utf8(value, strlen(value));
    Doc_Store(doc, field, (Obj*)field_val);
    DECREF(field);
    DECREF(field_val);
}

/* Parse one corpus file under `uscon_source` into a Doc with "title",
 * "content", "url", and "category" fields.  The first line of the file is
 * the title; the remainder is the body text.  The category is derived from
 * the filename prefix ("art" / "amend" / "preamble").  Exits on any I/O or
 * parse failure.  Caller takes ownership of the returned Doc.
 */
Doc*
S_parse_file(const char *filename) {
    // Build "<uscon_source>/<filename>".  Bounded snprintf replaces the
    // original unchecked strcat chain, and the malloc result is verified
    // before use (it was previously dereferenced unconditionally).
    size_t bytes = strlen(uscon_source) + 1 + strlen(filename) + 1;
    char *path = (char*)malloc(bytes);
    if (path == NULL) {
        fprintf(stderr, "Out of memory building path for '%s'\n", filename);
        exit(1);
    }
    snprintf(path, bytes, "%s/%s", uscon_source, filename);

    FILE *stream = fopen(path, "r");
    if (stream == NULL) {
        perror(path);
        exit(1);
    }

    // %m is a POSIX/GNU extension: fscanf allocates the target buffers.
    char *title    = NULL;
    char *bodytext = NULL;
    if (fscanf(stream, "%m[^\r\n] %m[\x01-\x7F]", &title, &bodytext) != 2) {
        fprintf(stderr, "Can't extract title/bodytext from '%s'", path);
        exit(1);
    }

    // Derive the category from the filename prefix.
    const char *category = NULL;
    if (S_starts_with(filename, "art")) {
        category = "article";
    }
    else if (S_starts_with(filename, "amend")) {
        category = "amendment";
    }
    else if (S_starts_with(filename, "preamble")) {
        category = "preamble";
    }
    else {
        fprintf(stderr, "Can't derive category for %s", filename);
        exit(1);
    }

    Doc *doc = Doc_new(NULL, 0);
    S_store_utf8_field(doc, "title",    title);
    S_store_utf8_field(doc, "content",  bodytext);
    S_store_utf8_field(doc, "url",      filename);
    S_store_utf8_field(doc, "category", category);

    fclose(stream);
    free(bodytext);
    free(title);
    free(path);
    return doc;
}