// Exercise the Dump/Load serialization round trip and Equals() for
// RegexTokenizer: two tokenizers with different patterns must compare
// unequal, and a Load() of a Dump() must compare equal to the original.
static void
test_Dump_Load_and_Equals(TestBatchRunner *runner) {
    if (!RegexTokenizer_is_available()) {
        // Skip all three planned assertions at once.  (The original code
        // issued the same single-test SKIP three times back to back.)
        SKIP(runner, 3, "RegexTokenizer not available");
        return;
    }

    // NOTE(review): "\\S+" matches runs of NON-whitespace characters; the
    // variable name is misleading but kept for interface stability.
    StackString *word_char_pattern  = SSTR_WRAP_UTF8("\\w+", 3);
    StackString *whitespace_pattern = SSTR_WRAP_UTF8("\\S+", 3);

    RegexTokenizer *word_char_tokenizer
        = RegexTokenizer_new((String*)word_char_pattern);
    RegexTokenizer *whitespace_tokenizer
        = RegexTokenizer_new((String*)whitespace_pattern);

    Obj *word_char_dump  = RegexTokenizer_Dump(word_char_tokenizer);
    Obj *whitespace_dump = RegexTokenizer_Dump(whitespace_tokenizer);

    // NOTE(review): Load() is invoked on whitespace_tokenizer for BOTH
    // dumps; presumably the receiver only determines the class while the
    // dump supplies the state -- confirm against the Load() contract.
    RegexTokenizer *word_char_clone
        = RegexTokenizer_Load(whitespace_tokenizer, word_char_dump);
    RegexTokenizer *whitespace_clone
        = RegexTokenizer_Load(whitespace_tokenizer, whitespace_dump);

    TEST_FALSE(runner,
               RegexTokenizer_Equals(word_char_tokenizer,
                                     (Obj*)whitespace_tokenizer),
               "Equals() false with different pattern");
    TEST_TRUE(runner,
              RegexTokenizer_Equals(word_char_tokenizer,
                                    (Obj*)word_char_clone),
              "Dump => Load round trip");
    TEST_TRUE(runner,
              RegexTokenizer_Equals(whitespace_tokenizer,
                                    (Obj*)whitespace_clone),
              "Dump => Load round trip");

    DECREF(word_char_tokenizer);
    DECREF(word_char_dump);
    DECREF(word_char_clone);
    DECREF(whitespace_tokenizer);
    DECREF(whitespace_dump);
    DECREF(whitespace_clone);
}
// Initialize a PolyAnalyzer from either an explicit analyzer chain or a
// language tag.  Exactly one of 'language'/'analyzers' must be supplied;
// passing neither throws.  Returns 'self' for call chaining.
PolyAnalyzer*
PolyAnalyzer_init(PolyAnalyzer *self, const CharBuf *language,
                  VArray *analyzers) {
    Analyzer_init((Analyzer*)self);

    if (analyzers) {
        // Caller supplied the chain: verify every element is an Analyzer,
        // then retain a refcounted reference to the array.
        uint32_t num_analyzers = VA_Get_Size(analyzers);
        for (uint32_t tick = 0; tick < num_analyzers; tick++) {
            CERTIFY(VA_Fetch(analyzers, tick), ANALYZER);
        }
        self->analyzers = (VArray*)INCREF(analyzers);
    }
    else if (language) {
        // Build the default chain for the language: case-fold, tokenize
        // on word characters, then stem.
        VArray *chain = VA_new(3);
        VA_Push(chain, (Obj*)CaseFolder_new());
        VA_Push(chain, (Obj*)RegexTokenizer_new(NULL));
        VA_Push(chain, (Obj*)SnowStemmer_new(language));
        self->analyzers = chain;
    }
    else {
        THROW(ERR, "Must specify either 'language' or 'analyzers'");
    }

    return self;
}
// Initialize a PolyAnalyzer from either an explicit analyzer chain or a
// language tag.  Exactly one of 'language'/'analyzers' must be supplied;
// passing neither throws.  Returns 'self' for call chaining.
PolyAnalyzer*
PolyAnalyzer_init(PolyAnalyzer *self, String *language, Vector *analyzers) {
    Analyzer_init((Analyzer*)self);
    PolyAnalyzerIVARS *const ivars = PolyAnalyzer_IVARS(self);

    if (analyzers) {
        // Caller supplied the chain: verify every element is an Analyzer,
        // then retain a refcounted reference to the vector.
        uint32_t num_analyzers = Vec_Get_Size(analyzers);
        for (uint32_t tick = 0; tick < num_analyzers; tick++) {
            CERTIFY(Vec_Fetch(analyzers, tick), ANALYZER);
        }
        ivars->analyzers = (Vector*)INCREF(analyzers);
    }
    else if (language) {
        // Build the default chain for the language: case-fold, tokenize
        // on word characters, then stem.
        Vector *chain = Vec_new(3);
        Vec_Push(chain, (Obj*)CaseFolder_new());
        Vec_Push(chain, (Obj*)RegexTokenizer_new(NULL));
        Vec_Push(chain, (Obj*)SnowStemmer_new(language));
        ivars->analyzers = chain;
    }
    else {
        THROW(ERR, "Must specify either 'language' or 'analyzers'");
    }

    return self;
}
static Folder* build_index() { // Plain type. String *pattern = Str_newf("\\S+"); RegexTokenizer *tokenizer = RegexTokenizer_new(pattern); FullTextType *plain = FullTextType_new((Analyzer*)tokenizer); // Fancy type. String *word_pattern = Str_newf("\\w+"); RegexTokenizer *word_tokenizer = RegexTokenizer_new(word_pattern); Hash *stop_list = Hash_new(0); Hash_Store_Utf8(stop_list, "x", 1, (Obj*)CFISH_TRUE); SnowballStopFilter *stop_filter = SnowStop_new(NULL, stop_list); Vector *analyzers = Vec_new(0); Vec_Push(analyzers, (Obj*)word_tokenizer); Vec_Push(analyzers, (Obj*)stop_filter); PolyAnalyzer *fancy_analyzer = PolyAnalyzer_new(NULL, analyzers); FullTextType *fancy = FullTextType_new((Analyzer*)fancy_analyzer); // Schema. Schema *schema = Schema_new(); String *plain_str = Str_newf("plain"); String *fancy_str = Str_newf("fancy"); Schema_Spec_Field(schema, plain_str, (FieldType*)plain); Schema_Spec_Field(schema, fancy_str, (FieldType*)fancy); // Indexer. RAMFolder *folder = RAMFolder_new(NULL); Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0); // Index documents. Vector *doc_set = TestUtils_doc_set(); for (uint32_t i = 0; i < Vec_Get_Size(doc_set); ++i) { String *content_string = (String*)Vec_Fetch(doc_set, i); Doc *doc = Doc_new(NULL, 0); Doc_Store(doc, plain_str, (Obj*)content_string); Doc_Store(doc, fancy_str, (Obj*)content_string); Indexer_Add_Doc(indexer, doc, 1.0); DECREF(doc); } Indexer_Commit(indexer); // Clean up. DECREF(doc_set); DECREF(indexer); DECREF(fancy_str); DECREF(plain_str); DECREF(schema); DECREF(fancy); DECREF(fancy_analyzer); DECREF(analyzers); DECREF(stop_list); DECREF(word_pattern); DECREF(plain); DECREF(tokenizer); DECREF(pattern); return (Folder*)folder; }