// Verify that Dump/Load round-trips a SnowballStemmer and that Equals()
// distinguishes stemmers built for different languages.
static void
test_Dump_Load_and_Equals(TestBatchRunner *runner) {
    String *en_iso = (String*)SSTR_WRAP_UTF8("en", 2);
    String *es_iso = (String*)SSTR_WRAP_UTF8("es", 2);
    SnowballStemmer *en_stemmer = SnowStemmer_new(en_iso);
    SnowballStemmer *es_stemmer = SnowStemmer_new(es_iso);

    // Serialize both stemmers, then thaw each dump back into an object.
    Obj *en_dump = (Obj*)SnowStemmer_Dump(en_stemmer);
    Obj *es_dump = (Obj*)SnowStemmer_Dump(es_stemmer);
    SnowballStemmer *en_thawed
        = (SnowballStemmer*)SnowStemmer_Load(es_stemmer, en_dump);
    SnowballStemmer *es_thawed
        = (SnowballStemmer*)SnowStemmer_Load(es_stemmer, es_dump);

    TEST_FALSE(runner, SnowStemmer_Equals(en_stemmer, (Obj*)es_stemmer),
               "Equals() false with different language");
    TEST_TRUE(runner, SnowStemmer_Equals(en_stemmer, (Obj*)en_thawed),
              "Dump => Load round trip");
    TEST_TRUE(runner, SnowStemmer_Equals(es_stemmer, (Obj*)es_thawed),
              "Dump => Load round trip");

    DECREF(en_stemmer);
    DECREF(en_dump);
    DECREF(en_thawed);
    DECREF(es_stemmer);
    DECREF(es_dump);
    DECREF(es_thawed);
}
// Run the Snowball test corpus: for each language in tests.json, stem every
// word and compare against the expected stems.
static void
test_stemming(TestBatchRunner *runner) {
    FSFolder *modules_folder = TestUtils_modules_folder();
    String *path = Str_newf("analysis/snowstem/source/test/tests.json");
    Hash *tests = (Hash*)Json_slurp_json((Folder*)modules_folder, path);
    if (!tests) { RETHROW(Err_get_error()); }

    String *iso;
    Hash   *lang_data;
    Hash_Iterate(tests);
    // Each entry maps an ISO language code to { "words": [...], "stems": [...] }.
    while (Hash_Next(tests, (Obj**)&iso, (Obj**)&lang_data)) {
        VArray *words = (VArray*)Hash_Fetch_Utf8(lang_data, "words", 5);
        VArray *stems = (VArray*)Hash_Fetch_Utf8(lang_data, "stems", 5);
        SnowballStemmer *stemmer = SnowStemmer_new(iso);
        uint32_t num_words = VA_Get_Size(words);
        for (uint32_t i = 0; i < num_words; i++) {
            String *word    = (String*)VA_Fetch(words, i);
            VArray *stemmed = SnowStemmer_Split(stemmer, word);
            String *stem    = (String*)VA_Fetch(stemmed, 0);
            TEST_TRUE(runner,
                      stem
                      && Str_Is_A(stem, STRING)
                      && Str_Equals(stem, VA_Fetch(stems, i)),
                      "Stem %s: %s", Str_Get_Ptr8(iso), Str_Get_Ptr8(word)
            );
            DECREF(stemmed);
        }
        DECREF(stemmer);
    }

    DECREF(tests);
    DECREF(modules_folder);
    DECREF(path);
}
// Initialize an EasyAnalyzer: standard tokenization, Unicode normalization,
// and Snowball stemming for the supplied language.
EasyAnalyzer*
EasyAnalyzer_init(EasyAnalyzer *self, const CharBuf *language) {
    Analyzer_init((Analyzer*)self);
    EasyAnalyzerIVARS *const ivars = EasyAnalyzer_IVARS(self);
    // Keep a private copy of the language code.
    ivars->language   = CB_Clone(language);
    ivars->tokenizer  = StandardTokenizer_new();
    ivars->normalizer = Normalizer_new(NULL, true, false);
    ivars->stemmer    = SnowStemmer_new(language);
    return self;
}
// Initialize a PolyAnalyzer from either an explicit array of Analyzers or a
// language code (which selects the default case-fold/tokenize/stem chain).
PolyAnalyzer*
PolyAnalyzer_init(PolyAnalyzer *self, const CharBuf *language,
                  VArray *analyzers) {
    Analyzer_init((Analyzer*)self);
    if (!analyzers && !language) {
        THROW(ERR, "Must specify either 'language' or 'analyzers'");
    }
    if (analyzers) {
        // Caller supplied the chain; verify every element is an Analyzer.
        uint32_t num_analyzers = VA_Get_Size(analyzers);
        for (uint32_t i = 0; i < num_analyzers; i++) {
            CERTIFY(VA_Fetch(analyzers, i), ANALYZER);
        }
        self->analyzers = (VArray*)INCREF(analyzers);
    }
    else {
        // Build the default chain for the language.
        self->analyzers = VA_new(3);
        VA_Push(self->analyzers, (Obj*)CaseFolder_new());
        VA_Push(self->analyzers, (Obj*)RegexTokenizer_new(NULL));
        VA_Push(self->analyzers, (Obj*)SnowStemmer_new(language));
    }
    return self;
}
// Initialize a PolyAnalyzer from either an explicit Vector of Analyzers or a
// language code (which selects the default case-fold/tokenize/stem chain).
PolyAnalyzer*
PolyAnalyzer_init(PolyAnalyzer *self, String *language, Vector *analyzers) {
    Analyzer_init((Analyzer*)self);
    PolyAnalyzerIVARS *const ivars = PolyAnalyzer_IVARS(self);
    if (analyzers) {
        // Caller supplied the chain; verify every element is an Analyzer.
        for (uint32_t i = 0, max = Vec_Get_Size(analyzers); i < max; i++) {
            CERTIFY(Vec_Fetch(analyzers, i), ANALYZER);
        }
        ivars->analyzers = (Vector*)INCREF(analyzers);
    }
    else if (language) {
        // Build the default chain for the language.
        Vector *chain = Vec_new(3);
        Vec_Push(chain, (Obj*)CaseFolder_new());
        Vec_Push(chain, (Obj*)RegexTokenizer_new(NULL));
        Vec_Push(chain, (Obj*)SnowStemmer_new(language));
        ivars->analyzers = chain;
    }
    else {
        THROW(ERR, "Must specify either 'language' or 'analyzers'");
    }
    return self;
}