/** * Init the Sally tool * @param argc number of arguments * @param argv arguments */ static void sally_init() { int ehash; const char *cfg_str; if (verbose > 1) config_print(&cfg); /* Set delimiters */ config_lookup_string(&cfg, "features.ngram_delim", &cfg_str); if (strlen(cfg_str) > 0) fvec_delim_set(cfg_str); /* Check for TFIDF weighting */ config_lookup_string(&cfg, "features.vect_embed", &cfg_str); if (!strcasecmp(cfg_str, "tfidf")) idf_create(input); /* Load stop words */ config_lookup_string(&cfg, "input.stopword_file", &cfg_str); if (strlen(cfg_str) > 0) stopwords_load(cfg_str); /* Check for feature hash table */ config_lookup_int(&cfg, "features.explicit_hash", &ehash); config_lookup_string(&cfg, "features.hash_file", &cfg_str); if (ehash || strlen(cfg_str) > 0) { info_msg(1, "Enabling feature hash table."); fhash_init(); } /* Open input */ config_lookup_string(&cfg, "input.input_format", &cfg_str); input_config(cfg_str); info_msg(1, "Opening '%0.40s' with input module '%s'.", input, cfg_str); entries = input_open(input); if (entries < 0) fatal("Could not open input source"); /* Open output */ config_lookup_string(&cfg, "output.output_format", &cfg_str); output_config(cfg_str); info_msg(1, "Opening '%0.40s' with output module '%s'.", output, cfg_str); if (!output_open(output)) fatal("Could not open output destination"); }
int test_pos_ngrams() { int i, err = 0; fvec_t *f; /* Test for positional n-grams */ test_t t[] = { {"b b b b b", 3, 0, 1}, {"b b b b b", 3, 1, 3}, {"b b b b b", 2, 0, 1}, {"b b b b b", 2, 1, 4}, {NULL, 0, 0, 0} }; test_printf("Testing positional n-grams"); /* Hack to set delimiters */ config_set_string(&cfg, "features.granularity", "tokens"); config_set_string(&cfg, "features.token_delim", " "); fvec_delim_set(" "); for (i = 0; t[i].str; i++) { config_set_int(&cfg, "features.ngram_len", t[i].nlen); config_set_bool(&cfg, "features.ngram_pos", t[i].flag); config_set_int(&cfg, "features.pos_shift", 0); /* Extract features */ f = fvec_extract(t[i].str, strlen(t[i].str)); /* Check for correct number of dimensions */ if (f->len != t[i].len) { test_error("(%d) len %d != %d", i, f->len, t[i].len); err++; } fvec_destroy(f); } config_set_bool(&cfg, "features.ngram_pos", 0); test_return(err, i); return err; }