/** * Exit Sally tool. Close open files and free memory. */ static void sally_exit() { int ehash; const char *cfg_str, *hash_file; info_msg(1, "Flushing. Closing input and output."); input_close(); output_close(); config_lookup_string(&cfg, "features.vect_embed", &cfg_str); if (!strcasecmp(cfg_str, "tfidf")) idf_destroy(input); config_lookup_string(&cfg, "input.stopword_file", &cfg_str); if (strlen(cfg_str) > 0) stopwords_destroy(); config_lookup_string(&cfg, "features.hash_file", &hash_file); if (strlen(hash_file) > 0) { info_msg(1, "Saving explicit hash table to '%s'.", hash_file); gzFile z = gzopen(hash_file, "w9"); if (!z) error("Could not open hash file '%s'", hash_file); fhash_write(z); gzclose(z); } config_lookup_int(&cfg, "features.explicit_hash", &ehash); if (ehash || strlen(hash_file) > 0) fhash_destroy(); /* Destroy configuration */ config_destroy(&cfg); }
/* * A simple test for the binary embedding */ int test_embed_tfidf() { int i, j, n, err = 0; string_t strs[10]; config_set_string(&cfg, "features.vect_norm", "none"); config_set_string(&cfg, "features.tfidf_file", TEST_TFIDF); unlink(TEST_TFIDF); char *test_file = getenv("TEST_FILE"); idf_create(test_file); test_printf("Testing TFIDF embedding"); input_config("lines"); n = input_open(test_file); input_read(strs, n); /* Compute IDF manually */ config_set_string(&cfg, "features.vect_embed", "bin"); fvec_t *w = fvec_zero(); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); fvec_add(w, fv); fvec_destroy(fv); } fvec_invert(w); fvec_mul(w, n); fvec_log2(w); if (!idf_check(w)) { err++; test_error("(%d) internal idf values seem to be wrong", i); } /* Invert w for multiplying out IDFs */ fvec_invert(w); config_set_string(&cfg, "features.vect_embed", "tfidf"); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); fvec_times(fv, w); /* Check if rest tf */ double d = 0; for (j = 0; j < fv->len; j++) d += fv->val[j]; err += fabs(d - 1.0) > 1e-6; fvec_destroy(fv); } test_return(err, n); fvec_destroy(w); input_free(strs, n); input_close(); idf_destroy(); unlink(TEST_TFIDF); return err; }