/*
 * Exercise the Highlighter against a small in-memory index.
 *
 * Sets up a schema with two highlightable full-text fields ("content" and
 * "alt", the latter with its boost set to 0.1), indexes three documents into
 * a RAMFolder, runs a single query through an IxSearcher and passes the
 * searcher, query and hits on to the individual excerpt tests.
 */
static void
test_highlighting(TestBatchRunner *runner) {
    String *content_field = (String*)SSTR_WRAP_UTF8("content", 7);
    String *alt_field     = (String*)SSTR_WRAP_UTF8("alt", 3);

    /* Schema: both field types share one tokenizer. */
    Schema            *schema    = Schema_new();
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    {
        FullTextType *regular_type = FullTextType_new((Analyzer*)tokenizer);
        FullTextType_Set_Highlightable(regular_type, true);
        Schema_Spec_Field(schema, content_field, (FieldType*)regular_type);
        DECREF(regular_type);
    }
    {
        FullTextType *low_boost_type
            = FullTextType_new((Analyzer*)tokenizer);
        FullTextType_Set_Highlightable(low_boost_type, true);
        FullTextType_Set_Boost(low_boost_type, 0.1f);
        Schema_Spec_Field(schema, alt_field, (FieldType*)low_boost_type);
        DECREF(low_boost_type);
    }
    DECREF(tokenizer);

    RAMFolder *folder  = RAMFolder_new(NULL);
    Indexer   *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);
    Doc       *doc;
    String    *text;

    /* Document 1: the shared test string in "content". */
    doc  = Doc_new(NULL, 0);
    text = (String*)SSTR_WRAP_UTF8(TEST_STRING, TEST_STRING_LEN);
    Doc_Store(doc, content_field, (Obj*)text);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    /* Document 2: a short quoted sentence in "content". */
    doc  = Doc_new(NULL, 0);
    text = (String*)SSTR_WRAP_UTF8("\"I see,\" said the blind man.", 28);
    Doc_Store(doc, content_field, (Obj*)text);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    /* Document 3: filler in "content", and the test string followed by
     * extra text in the low-boost "alt" field. */
    doc  = Doc_new(NULL, 0);
    text = (String*)SSTR_WRAP_UTF8("x but not why or 2ee", 20);
    Doc_Store(doc, content_field, (Obj*)text);
    text = (String*)SSTR_WRAP_UTF8(TEST_STRING
                                   " and extra stuff so it scores lower",
                                   TEST_STRING_LEN + 35);
    Doc_Store(doc, alt_field, (Obj*)text);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    Indexer_Commit(indexer);
    DECREF(indexer);

    /* Search the fresh index and run the excerpt tests. */
    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)folder);
    Obj      *query    = (Obj*)SSTR_WRAP_UTF8("\"x y z\" AND " PHI, 14);
    Hits     *hits     = Searcher_Hits(searcher, query, 0, 10, NULL);

    test_Raw_Excerpt(runner, searcher, query);
    test_Highlight_Excerpt(runner, searcher, query);
    test_Create_Excerpt(runner, searcher, query, hits);

    DECREF(hits);
    DECREF(searcher);
    DECREF(folder);
    DECREF(schema);
}
// Create a document holding a 'title' and a 'content' field and add it to
// the indexer.
//
// `title` and `content` are C strings; their lengths are taken with
// strlen() and the bytes are passed to Str_new_from_utf8().
static void
S_add_document(Indexer *indexer, const char *title, const char *content) {
    Doc *new_doc = Doc_new(NULL, 0);

    // Store 'title' field
    String *title_key = Str_newf("title");
    String *title_val = Str_new_from_utf8(title, strlen(title));
    Doc_Store(new_doc, title_key, (Obj*)title_val);
    DECREF(title_key);
    DECREF(title_val);

    // Store 'content' field
    String *content_key = Str_newf("content");
    String *content_val = Str_new_from_utf8(content, strlen(content));
    Doc_Store(new_doc, content_key, (Obj*)content_val);
    DECREF(content_key);
    DECREF(content_val);

    Indexer_Add_Doc(indexer, new_doc, 1.0);
    DECREF(new_doc);
}
static Folder* S_create_index() { Schema *schema = (Schema*)TestSchema_new(); RAMFolder *folder = RAMFolder_new(NULL); VArray *doc_set = TestUtils_doc_set(); Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, NULL, 0); u32_t i, max; for (i = 0, max = VA_Get_Size(doc_set); i < max; i++) { static CharBuf field = ZCB_LITERAL("content"); Doc *doc = Doc_new(NULL, 0); Doc_Store(doc, &field, VA_Fetch(doc_set, i)); Indexer_Add_Doc(indexer, doc, 1.0f); DECREF(doc); } Indexer_Commit(indexer); DECREF(doc_set); DECREF(indexer); DECREF(schema); return (Folder*)folder; }
/*
 * Check that excerpt selection keeps at least one query term.
 *
 * Indexes a single long document made of filler sentences with the terms
 * "NNN" and "MMM" buried in the middle, searches for "NNN MMM", and asserts
 * that the excerpt produced by the Highlighter (constructed with 200 as its
 * final argument -- presumably the excerpt length, confirm against the
 * Highlighter API) contains "MMM" or "NNN".
 */
static void
test_hl_selection(TestBatchRunner *runner) {
    Schema *schema = Schema_new();
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    FullTextType *plain_type = FullTextType_new((Analyzer*)tokenizer);
    FullTextType_Set_Highlightable(plain_type, true);
    String *content = (String*)SSTR_WRAP_UTF8("content", 7);
    Schema_Spec_Field(schema, content, (FieldType*)plain_type);
    DECREF(plain_type);
    DECREF(tokenizer);

    RAMFolder *folder = RAMFolder_new(NULL);
    Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);

    /* The text ends with a newline character.  It is written as the "\n"
     * escape because a raw line break inside a string literal is not valid
     * C. */
    static char test_string[] =
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla NNN bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla MMM bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. \n";

    Doc *doc = Doc_new(NULL, 0);
    String *string = (String *)SSTR_WRAP_UTF8(test_string, strlen(test_string));
    Doc_Store(doc, content, (Obj*)string);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);
    Indexer_Commit(indexer);
    DECREF(indexer);

    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)folder);
    Obj *query = (Obj*)SSTR_WRAP_UTF8("NNN MMM", 7);
    Highlighter *highlighter = Highlighter_new(searcher, query, content, 200);
    Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);
    HitDoc *hit = Hits_Next(hits);
    String *excerpt = Highlighter_Create_Excerpt(highlighter, hit);

    /* Either term is acceptable; the excerpt must not lose both. */
    String *mmm = (String*)SSTR_WRAP_UTF8("MMM", 3);
    String *nnn = (String*)SSTR_WRAP_UTF8("NNN", 3);
    TEST_TRUE(runner,
              Str_Find(excerpt, mmm) >= 0 || Str_Find(excerpt, nnn) >= 0,
              "Sentence boundary algo doesn't chop terms");

    DECREF(excerpt);
    DECREF(hit);
    DECREF(hits);
    DECREF(highlighter);
    DECREF(searcher);
    DECREF(folder);
    DECREF(schema);
}
int main() { // Initialize the library. lucy_bootstrap_parcel(); Schema *schema = S_create_schema(); String *folder = Str_newf("%s", path_to_index); Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, Indexer_CREATE | Indexer_TRUNCATE); DIR *dir = opendir(uscon_source); if (dir == NULL) { perror(uscon_source); return 1; } for (struct dirent *entry = readdir(dir); entry; entry = readdir(dir)) { if (S_ends_with(entry->d_name, ".txt")) { Doc *doc = S_parse_file(entry->d_name); Indexer_Add_Doc(indexer, doc, 1.0); DECREF(doc); } } closedir(dir); Indexer_Commit(indexer); DECREF(indexer); DECREF(folder); DECREF(schema); return 0; }
static Folder* build_index() { // Plain type. String *pattern = Str_newf("\\S+"); RegexTokenizer *tokenizer = RegexTokenizer_new(pattern); FullTextType *plain = FullTextType_new((Analyzer*)tokenizer); // Fancy type. String *word_pattern = Str_newf("\\w+"); RegexTokenizer *word_tokenizer = RegexTokenizer_new(word_pattern); Hash *stop_list = Hash_new(0); Hash_Store_Utf8(stop_list, "x", 1, (Obj*)CFISH_TRUE); SnowballStopFilter *stop_filter = SnowStop_new(NULL, stop_list); Vector *analyzers = Vec_new(0); Vec_Push(analyzers, (Obj*)word_tokenizer); Vec_Push(analyzers, (Obj*)stop_filter); PolyAnalyzer *fancy_analyzer = PolyAnalyzer_new(NULL, analyzers); FullTextType *fancy = FullTextType_new((Analyzer*)fancy_analyzer); // Schema. Schema *schema = Schema_new(); String *plain_str = Str_newf("plain"); String *fancy_str = Str_newf("fancy"); Schema_Spec_Field(schema, plain_str, (FieldType*)plain); Schema_Spec_Field(schema, fancy_str, (FieldType*)fancy); // Indexer. RAMFolder *folder = RAMFolder_new(NULL); Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0); // Index documents. Vector *doc_set = TestUtils_doc_set(); for (uint32_t i = 0; i < Vec_Get_Size(doc_set); ++i) { String *content_string = (String*)Vec_Fetch(doc_set, i); Doc *doc = Doc_new(NULL, 0); Doc_Store(doc, plain_str, (Obj*)content_string); Doc_Store(doc, fancy_str, (Obj*)content_string); Indexer_Add_Doc(indexer, doc, 1.0); DECREF(doc); } Indexer_Commit(indexer); // Clean up. DECREF(doc_set); DECREF(indexer); DECREF(fancy_str); DECREF(plain_str); DECREF(schema); DECREF(fancy); DECREF(fancy_analyzer); DECREF(analyzers); DECREF(stop_list); DECREF(word_pattern); DECREF(plain); DECREF(tokenizer); DECREF(pattern); return (Folder*)folder; }