/* * A simple test for the binary embedding */ int test_embed_tfidf() { int i, j, n, err = 0; string_t strs[10]; config_set_string(&cfg, "features.vect_norm", "none"); config_set_string(&cfg, "features.tfidf_file", TEST_TFIDF); unlink(TEST_TFIDF); char *test_file = getenv("TEST_FILE"); idf_create(test_file); test_printf("Testing TFIDF embedding"); input_config("lines"); n = input_open(test_file); input_read(strs, n); /* Compute IDF manually */ config_set_string(&cfg, "features.vect_embed", "bin"); fvec_t *w = fvec_zero(); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); fvec_add(w, fv); fvec_destroy(fv); } fvec_invert(w); fvec_mul(w, n); fvec_log2(w); if (!idf_check(w)) { err++; test_error("(%d) internal idf values seem to be wrong", i); } /* Invert w for multiplying out IDFs */ fvec_invert(w); config_set_string(&cfg, "features.vect_embed", "tfidf"); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); fvec_times(fv, w); /* Check if rest tf */ double d = 0; for (j = 0; j < fv->len; j++) d += fv->val[j]; err += fabs(d - 1.0) > 1e-6; fvec_destroy(fv); } test_return(err, n); fvec_destroy(w); input_free(strs, n); input_close(); idf_destroy(); unlink(TEST_TFIDF); return err; }
/* * A simple test for the binary embedding */ int test_embed_bin() { int i, j, n, err = 0; string_t strs[10]; input_config("lines"); char *test_file = getenv("TEST_FILE"); n = input_open(test_file); input_read(strs, n); test_printf("Testing binary embedding"); config_set_string(&cfg, "features.vect_embed", "bin"); config_set_string(&cfg, "features.vect_norm", "none"); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); double n = 0; for (j = 0; j < fv->len; j++) n += fv->val[j]; err += fabs(n - fv->len) > 1e-6; fvec_destroy(fv); } test_return(err, n); input_free(strs, n); input_close(); return err; }
/* * A simple test for the l2 norm */ int test_norm_l2() { int i, j, n, err = 0; string_t strs[10]; input_config("lines"); char *test_file = getenv("TEST_FILE"); n = input_open(test_file); input_read(strs, n); test_printf("Testing L2 normalization"); config_set_string(&cfg, "features.vect_norm", "l2"); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); double n = 0; for (j = 0; j < fv->len; j++) n += fv->val[j] * fv->val[j]; err += fabs(sqrt(n) - 1.0) > 1e-6; fvec_destroy(fv); } test_return(err, n); input_free(strs, n); input_close(); return err; }
/**
 * Load one section of settings from a configuration file.
 *
 * Forwards the request to input_config() and hands its result back to the
 * caller unchanged. A negative result is treated as a failure and reported
 * on stderr together with the file and section name.
 *
 * @param pFilename  name of the configuration file
 * @param configs    table describing the settings of the section
 * @param pSection   name of the section to load
 * @return the value returned by input_config(); negative on failure
 */
static int Configuration_LoadSection(const char *pFilename, const struct Config_Tag configs[], const char *pSection)
{
    const int status = input_config(pFilename, configs, pSection);

    if (status >= 0) {
        return status;
    }

    fprintf(stderr, "Can not load configuration file %s (section %s).\n", pFilename, pSection);
    return status;
}
/** * Init the Sally tool * @param argc number of arguments * @param argv arguments */ static void sally_init() { int ehash; const char *cfg_str; if (verbose > 1) config_print(&cfg); /* Set delimiters */ config_lookup_string(&cfg, "features.ngram_delim", &cfg_str); if (strlen(cfg_str) > 0) fvec_delim_set(cfg_str); /* Check for TFIDF weighting */ config_lookup_string(&cfg, "features.vect_embed", &cfg_str); if (!strcasecmp(cfg_str, "tfidf")) idf_create(input); /* Load stop words */ config_lookup_string(&cfg, "input.stopword_file", &cfg_str); if (strlen(cfg_str) > 0) stopwords_load(cfg_str); /* Check for feature hash table */ config_lookup_int(&cfg, "features.explicit_hash", &ehash); config_lookup_string(&cfg, "features.hash_file", &cfg_str); if (ehash || strlen(cfg_str) > 0) { info_msg(1, "Enabling feature hash table."); fhash_init(); } /* Open input */ config_lookup_string(&cfg, "input.input_format", &cfg_str); input_config(cfg_str); info_msg(1, "Opening '%0.40s' with input module '%s'.", input, cfg_str); entries = input_open(input); if (entries < 0) fatal("Could not open input source"); /* Open output */ config_lookup_string(&cfg, "output.output_format", &cfg_str); output_config(cfg_str); info_msg(1, "Opening '%0.40s' with output module '%s'.", output, cfg_str); if (!output_open(output)) fatal("Could not open output destination"); }