// Verify that SnowballStemmer reduces each word in the Snowball test
// corpus to the expected stem, for every language covered by tests.json.
static void
test_stemming(TestBatchRunner *runner) {
    FSFolder *modules_folder = TestUtils_modules_folder();
    if (modules_folder == NULL) {
        // Guard against a missing test-data checkout, matching the other
        // test functions in this file -- previously a NULL folder was
        // passed straight to Json_slurp_json.
        // NOTE(review): the skip count must equal the number of stemming
        // tests planned via TestBatchRunner_Plan -- confirm.
        SKIP(runner, 150, "Can't locate test data");
        return;
    }
    String *path = Str_newf("analysis/snowstem/source/test/tests.json");
    Hash *tests = (Hash*)Json_slurp_json((Folder*)modules_folder, path);
    if (!tests) { RETHROW(Err_get_error()); }

    // tests.json maps ISO language codes to { "words": [...], "stems": [...] }.
    String *iso;
    Hash   *lang_data;
    Hash_Iterate(tests);
    while (Hash_Next(tests, (Obj**)&iso, (Obj**)&lang_data)) {
        VArray *words = (VArray*)Hash_Fetch_Utf8(lang_data, "words", 5);
        VArray *stems = (VArray*)Hash_Fetch_Utf8(lang_data, "stems", 5);
        SnowballStemmer *stemmer = SnowStemmer_new(iso);
        for (uint32_t i = 0, max = VA_Get_Size(words); i < max; i++) {
            String *word = (String*)VA_Fetch(words, i);
            // Split() returns an array; the stem is its sole element.
            VArray *got  = SnowStemmer_Split(stemmer, word);
            String *stem = (String*)VA_Fetch(got, 0);
            TEST_TRUE(runner,
                      stem
                      && Str_Is_A(stem, STRING)
                      && Str_Equals(stem, VA_Fetch(stems, i)),
                      "Stem %s: %s", Str_Get_Ptr8(iso), Str_Get_Ptr8(word)
                     );
            DECREF(got);
        }
        DECREF(stemmer);
    }

    DECREF(tests);
    DECREF(modules_folder);
    DECREF(path);
}
// Exercise Normalizer against the bundled utf8proc corpus.  Each test case
// specifies a normalization form plus case-fold / strip-accent flags, and
// pairs input words with their expected normalized output.
static void
test_normalization(TestBatchRunner *runner) {
    FSFolder *modules_folder = TestUtils_modules_folder();
    if (modules_folder == NULL) {
        SKIP(runner, 13, "Can't locate test data");
        return;
    }

    String *path  = Str_newf("unicode/utf8proc/tests.json");
    Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path);
    if (!tests) { RETHROW(Err_get_error()); }

    uint32_t num_cases = Vec_Get_Size(tests);
    for (uint32_t case_num = 0; case_num < num_cases; case_num++) {
        Hash *test = (Hash*)Vec_Fetch(tests, case_num);

        // Build a Normalizer configured per this test case.
        String *form = (String*)Hash_Fetch_Utf8(
                           test, "normalization_form", 18);
        bool case_fold = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8(
                             test, "case_fold", 9));
        bool strip_accents = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8(
                                 test, "strip_accents", 13));
        Normalizer *normalizer
            = Normalizer_new(form, case_fold, strip_accents);

        Vector *words = (Vector*)Hash_Fetch_Utf8(test, "words", 5);
        Vector *norms = (Vector*)Hash_Fetch_Utf8(test, "norms", 5);
        uint32_t num_words = Vec_Get_Size(words);
        for (uint32_t word_num = 0; word_num < num_words; word_num++) {
            String *word = (String*)Vec_Fetch(words, word_num);
            // Split() yields an array whose first element is the
            // normalized form of the input word.
            Vector *got  = Normalizer_Split(normalizer, word);
            String *norm = (String*)Vec_Fetch(got, 0);
            TEST_TRUE(runner,
                      norm
                      && Str_is_a(norm, STRING)
                      && Str_Equals(norm, Vec_Fetch(norms, word_num)),
                      "Normalize %s %d %d: %s", Str_Get_Ptr8(form),
                      case_fold, strip_accents, Str_Get_Ptr8(word)
                     );
            DECREF(got);
        }

        DECREF(normalizer);
    }

    DECREF(tests);
    DECREF(modules_folder);
    DECREF(path);
}
// Test StandardTokenizer.  First split a hand-built string that packs
// several Unicode word-break edge cases into one input and check each
// resulting token; then, if the test-data checkout is present, run the
// full UCD WordBreakTest corpus.
static void test_tokenizer(TestBatchRunner *runner) {
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    // Input pieces (concatenated):
    //   " ."                          -- leading space and period, not tokens
    //   "tha\xCC\x82t's"              -- combining circumflex + apostrophe
    //   ":"                           -- separator, not a token
    //   "1,02\xC2\xADZ4.38"           -- number with embedded soft hyphen
    //   "\xE0\xB8\x81\xC2\xAD\xC2\xAD" -- THAI KO KAI + two soft hyphens
    //   "\xF0\xA0\x80\x80"            -- U+20000, supplementary-plane char
    //   "a" "/"                       -- final letter; '/' is dropped
    String *word = SSTR_WRAP_C(
        " ."
        "tha\xCC\x82t's"
        ":"
        "1,02\xC2\xADZ4.38"
        "\xE0\xB8\x81\xC2\xAD\xC2\xAD"
        "\xF0\xA0\x80\x80"
        "a"
        "/");
    Vector *got = StandardTokenizer_Split(tokenizer, word);
    // Check each expected token in order; byte lengths are passed
    // explicitly because the literals contain multi-byte UTF-8 sequences.
    String *token = (String*)Vec_Fetch(got, 0);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "tha\xcc\x82t's", 8),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 1);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "1,02\xC2\xADZ4.38", 11),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 2);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "\xE0\xB8\x81\xC2\xAD\xC2\xAD", 7),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 3);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "\xF0\xA0\x80\x80", 4),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 4);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "a", 1),
              "Token: %s", Str_Get_Ptr8(token));
    DECREF(got);

    // Second phase: the complete Unicode WordBreakTest suite, skipped as a
    // block (1372 tests) when the test data can't be found.
    FSFolder *modules_folder = TestUtils_modules_folder();
    if (modules_folder == NULL) {
        SKIP(runner, 1372, "Can't locate test data");
    }
    else {
        String *path = Str_newf("unicode/ucd/WordBreakTest.json");
        Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path);
        if (!tests) { RETHROW(Err_get_error()); }
        // Each corpus entry supplies a text and the expected word list.
        for (uint32_t i = 0, max = Vec_Get_Size(tests); i < max; i++) {
            Hash *test = (Hash*)Vec_Fetch(tests, i);
            String *text = (String*)Hash_Fetch_Utf8(test, "text", 4);
            Vector *wanted = (Vector*)Hash_Fetch_Utf8(test, "words", 5);
            Vector *got = StandardTokenizer_Split(tokenizer, text);
            TEST_TRUE(runner, Vec_Equals(wanted, (Obj*)got),
                      "UCD test #%d", i + 1);
            DECREF(got);
        }
        DECREF(tests);
        DECREF(modules_folder);
        DECREF(path);
    }
    DECREF(tokenizer);
}