Beispiel #1
0
/**
 * Exit Sally tool. Close open files and free memory.
 */
static void sally_exit()
{
    int ehash;
    const char *cfg_str, *hash_file;

    info_msg(1, "Flushing. Closing input and output.");
    input_close();
    output_close();

    config_lookup_string(&cfg, "features.vect_embed", &cfg_str);
    if (!strcasecmp(cfg_str, "tfidf"))
        idf_destroy(input);

    config_lookup_string(&cfg, "input.stopword_file", &cfg_str);
    if (strlen(cfg_str) > 0)
        stopwords_destroy();

    config_lookup_string(&cfg, "features.hash_file", &hash_file);
    if (strlen(hash_file) > 0) {
        info_msg(1, "Saving explicit hash table to '%s'.", hash_file);    
        gzFile z = gzopen(hash_file, "w9");
        if (!z)
            error("Could not open hash file '%s'", hash_file);
        fhash_write(z);
        gzclose(z);
    }
    
    config_lookup_int(&cfg, "features.explicit_hash", &ehash);
    if (ehash || strlen(hash_file) > 0)
        fhash_destroy();

    /* Destroy configuration */
    config_destroy(&cfg);

}
Beispiel #2
0
/* 
 * A simple test for the binary embedding
 */
int test_embed_tfidf()
{
    int i, j, n, err = 0;
    string_t strs[10];

    config_set_string(&cfg, "features.vect_norm", "none");
    config_set_string(&cfg, "features.tfidf_file", TEST_TFIDF);

    unlink(TEST_TFIDF);
    char *test_file = getenv("TEST_FILE");
    idf_create(test_file);
    test_printf("Testing TFIDF embedding");

    input_config("lines");
    n = input_open(test_file);
    input_read(strs, n);

    /* Compute IDF manually */
    config_set_string(&cfg, "features.vect_embed", "bin");
    fvec_t *w = fvec_zero();
    for (i = 0, err = 0; i < n; i++) {
        fvec_t *fv = fvec_extract(strs[i].str, strs[i].len);
        fvec_add(w, fv);
        fvec_destroy(fv);
    }
    fvec_invert(w);
    fvec_mul(w, n);
    fvec_log2(w);

    if (!idf_check(w)) {
        err++;
        test_error("(%d) internal idf values seem to be wrong", i);
    }

    /* Invert w for multiplying out IDFs */
    fvec_invert(w);

    config_set_string(&cfg, "features.vect_embed", "tfidf");
    for (i = 0, err = 0; i < n; i++) {
        fvec_t *fv = fvec_extract(strs[i].str, strs[i].len);
        fvec_times(fv, w);

        /* Check if rest tf */
        double d = 0;
        for (j = 0; j < fv->len; j++)
            d += fv->val[j];
        err += fabs(d - 1.0) > 1e-6;
        fvec_destroy(fv);
    }
    test_return(err, n);

    fvec_destroy(w);
    input_free(strs, n);
    input_close();

    idf_destroy();
    unlink(TEST_TFIDF);

    return err;
}