Example 1
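/* Drive SnowballStemmer over the per-language fixtures in
 * analysis/snowstem/source/test/tests.json: every entry in a language's
 * "words" array must stem to the matching entry in its "stems" array. */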
static void
test_stemming(TestBatchRunner *runner) {
    FSFolder *modules_folder = TestUtils_modules_folder();
    String *path = Str_newf("analysis/snowstem/source/test/tests.json");
    Hash *tests = (Hash*)Json_slurp_json((Folder*)modules_folder, path);
    if (!tests) { RETHROW(Err_get_error()); }

    String *iso;
    Hash *lang_data;
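    /* Walk the top-level hash: each key is an ISO language code, each
     * value a hash holding parallel "words" and "stems" arrays. */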
    Hash_Iterate(tests);
    while (Hash_Next(tests, (Obj**)&iso, (Obj**)&lang_data)) {
        VArray *words = (VArray*)Hash_Fetch_Utf8(lang_data, "words", 5);
        VArray *stems = (VArray*)Hash_Fetch_Utf8(lang_data, "stems", 5);
        SnowballStemmer *stemmer = SnowStemmer_new(iso);
        for (uint32_t i = 0, max = VA_Get_Size(words); i < max; i++) {
            String *word  = (String*)VA_Fetch(words, i);
            VArray *got   = SnowStemmer_Split(stemmer, word);
            String *stem  = (String*)VA_Fetch(got, 0);
            TEST_TRUE(runner,
                      stem
                      && Str_Is_A(stem, STRING)
                      && Str_Equals(stem, VA_Fetch(stems, i)),
                      "Stem %s: %s", Str_Get_Ptr8(iso), Str_Get_Ptr8(word)
                     );
            DECREF(got);
        }
        DECREF(stemmer);
    }

    DECREF(tests);
    DECREF(modules_folder);
    DECREF(path);
}
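For context, here is a minimal sketch of calling the stemmer directly, outside the JSON-driven loop above. It reuses only calls that already appear in this excerpt (Str_newf, SnowStemmer_new, SnowStemmer_Split, VA_Fetch, Str_Get_Ptr8, DECREF) and keeps the VArray API this version uses; the function name, the "en" ISO code, and the sample word are illustrative, and the #include / parcel-bootstrap boilerplate (plus <stdio.h> for printf) is assumed to be supplied by the surrounding file, just as it is omitted in the excerpts here.

static void
stem_one_word_sketch(void) {
    String *iso  = Str_newf("en");                    /* ISO language code */
    SnowballStemmer *stemmer = SnowStemmer_new(iso);
    String *word = Str_newf("horses");
    VArray *got  = SnowStemmer_Split(stemmer, word);  /* stems for the input */
    String *stem = (String*)VA_Fetch(got, 0);
    /* Str_Get_Ptr8 is used with %s here the same way the TEST_TRUE
     * messages above use it. */
    printf("%s -> %s\n", Str_Get_Ptr8(word), Str_Get_Ptr8(stem));
    DECREF(got);
    DECREF(word);
    DECREF(stemmer);
    DECREF(iso);
}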
Example 2
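/* Drive Normalizer over the utf8proc fixtures: each test case names a
 * normalization form plus case_fold/strip_accents flags, and every word
 * in "words" must normalize to the matching entry in "norms". */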
static void
test_normalization(TestBatchRunner *runner) {
    FSFolder *modules_folder = TestUtils_modules_folder();
    if (modules_folder == NULL) {
        SKIP(runner, 13, "Can't locate test data");
        return;
    }

    String *path = Str_newf("unicode/utf8proc/tests.json");
    Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path);
    if (!tests) { RETHROW(Err_get_error()); }

    for (uint32_t i = 0, max = Vec_Get_Size(tests); i < max; i++) {
        Hash *test = (Hash*)Vec_Fetch(tests, i);
        String *form = (String*)Hash_Fetch_Utf8(
                            test, "normalization_form", 18);
        bool case_fold = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8(
                                              test, "case_fold", 9));
        bool strip_accents = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8(
                                                  test, "strip_accents", 13));
        Normalizer *normalizer = Normalizer_new(form, case_fold, strip_accents);
        Vector *words = (Vector*)Hash_Fetch_Utf8(test, "words", 5);
        Vector *norms = (Vector*)Hash_Fetch_Utf8(test, "norms", 5);
        for (uint32_t j = 0, max = Vec_Get_Size(words); j < max; j++) {
            String *word = (String*)Vec_Fetch(words, j);
            Vector *got  = Normalizer_Split(normalizer, word);
            String *norm = (String*)Vec_Fetch(got, 0);
            TEST_TRUE(runner,
                      norm
                      && Str_is_a(norm, STRING)
                      && Str_Equals(norm, Vec_Fetch(norms, j)),
                      "Normalize %s %d %d: %s", Str_Get_Ptr8(form),
                      case_fold, strip_accents, Str_Get_Ptr8(word)
                     );
            DECREF(got);
        }
        DECREF(normalizer);
    }

    DECREF(tests);
    DECREF(modules_folder);
    DECREF(path);
}
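A similar sketch for Normalizer, again built only from calls seen above; the "NFKC" form name is an assumption about what the normalization_form fixtures carry, and the flag combination, function name, and sample word are illustrative.

static void
normalize_one_word_sketch(void) {
    String *form = Str_newf("NFKC");
    /* case_fold on, strip_accents off */
    Normalizer *normalizer = Normalizer_new(form, true, false);
    String *word = Str_newf("Caf\xC3\xA9");           /* "Café" in UTF-8 */
    Vector *got  = Normalizer_Split(normalizer, word);
    String *norm = (String*)Vec_Fetch(got, 0);
    printf("%s -> %s\n", Str_Get_Ptr8(word), Str_Get_Ptr8(norm));
    DECREF(got);
    DECREF(word);
    DECREF(normalizer);
    DECREF(form);
}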
Example 3
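/* Check StandardTokenizer both against a hand-built string of tricky
 * word boundaries and against the Unicode UCD WordBreakTest fixtures. */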
static void
test_tokenizer(TestBatchRunner *runner) {
    StandardTokenizer *tokenizer = StandardTokenizer_new();

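    /* Input mixing leading/trailing punctuation, a word carrying a
     * combining circumflex (U+0302), soft hyphens (U+00AD) inside a
     * number and after the Thai letter ko kai (U+0E01), a
     * supplementary-plane character (U+20000), and a bare "a". */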
    String *word = SSTR_WRAP_C(
                              " ."
                              "tha\xCC\x82t's"
                              ":"
                              "1,02\xC2\xADZ4.38"
                              "\xE0\xB8\x81\xC2\xAD\xC2\xAD"
                              "\xF0\xA0\x80\x80"
                              "a"
                              "/");
    Vector *got = StandardTokenizer_Split(tokenizer, word);
    String *token = (String*)Vec_Fetch(got, 0);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "tha\xcc\x82t's", 8),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 1);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "1,02\xC2\xADZ4.38", 11),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 2);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "\xE0\xB8\x81\xC2\xAD\xC2\xAD", 7),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 3);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "\xF0\xA0\x80\x80", 4),
              "Token: %s", Str_Get_Ptr8(token));
    token = (String*)Vec_Fetch(got, 4);
    TEST_TRUE(runner,
              token
              && Str_is_a(token, STRING)
              && Str_Equals_Utf8(token, "a", 1),
              "Token: %s", Str_Get_Ptr8(token));
    DECREF(got);

    FSFolder *modules_folder = TestUtils_modules_folder();
    if (modules_folder == NULL) {
        SKIP(runner, 1372, "Can't locate test data");
    }
    else {
        String *path = Str_newf("unicode/ucd/WordBreakTest.json");
        Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path);
        if (!tests) { RETHROW(Err_get_error()); }

        for (uint32_t i = 0, max = Vec_Get_Size(tests); i < max; i++) {
            Hash *test = (Hash*)Vec_Fetch(tests, i);
            String *text = (String*)Hash_Fetch_Utf8(test, "text", 4);
            Vector *wanted = (Vector*)Hash_Fetch_Utf8(test, "words", 5);
            Vector *got = StandardTokenizer_Split(tokenizer, text);
            TEST_TRUE(runner, Vec_Equals(wanted, (Obj*)got), "UCD test #%d", i + 1);
            DECREF(got);
        }

        DECREF(tests);
        DECREF(modules_folder);
        DECREF(path);
    }

    DECREF(tokenizer);
}
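Finally, a sketch of tokenizing free text with StandardTokenizer and walking the resulting Vector, using the same calls the test uses; the function name and the sample sentence are illustrative, and the usual includes/bootstrap are again assumed.

static void
tokenize_sentence_sketch(void) {
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    String *text = Str_newf("It's 3.14, isn't it?");
    Vector *got  = StandardTokenizer_Split(tokenizer, text);
    for (uint32_t i = 0, max = Vec_Get_Size(got); i < max; i++) {
        String *token = (String*)Vec_Fetch(got, i);
        printf("token %u: %s\n", (unsigned)i, Str_Get_Ptr8(token));
    }
    DECREF(got);
    DECREF(text);
    DECREF(tokenizer);
}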