Beispiel #1
0
/**
 * Look up a string setting in the global configuration.
 * A failed lookup must not leave the caller with an uninitialized or
 * stale pointer, so an empty string is returned in that case.
 * @param path path of the setting
 * @return value of the setting, or an empty string if the lookup fails
 */
static const char *sally_cfg_string(const char *path)
{
    const char *value = NULL;

    if (!config_lookup_string(&cfg, path, &value) || !value)
        return "";
    return value;
}

/**
 * Init the Sally tool. Applies the settings of the global configuration
 * (delimiters, TFIDF weighting, stop words, feature hash table) and then
 * opens the input source and the output destination. The function takes
 * no arguments; it works on the file-level variables cfg, verbose, input,
 * output and entries. It calls fatal() if the input or the output cannot
 * be opened.
 */
static void sally_init(void)
{
    /* Stays 0 if the setting cannot be looked up */
    int ehash = 0;
    const char *cfg_str;

    if (verbose > 1)
        config_print(&cfg);

    /* Set delimiters */
    cfg_str = sally_cfg_string("features.ngram_delim");
    if (strlen(cfg_str) > 0)
        fvec_delim_set(cfg_str);

    /* Check for TFIDF weighting */
    cfg_str = sally_cfg_string("features.vect_embed");
    if (!strcasecmp(cfg_str, "tfidf"))
        idf_create(input);

    /* Load stop words */
    cfg_str = sally_cfg_string("input.stopword_file");
    if (strlen(cfg_str) > 0)
        stopwords_load(cfg_str);

    /* Check for feature hash table */
    config_lookup_int(&cfg, "features.explicit_hash", &ehash);
    cfg_str = sally_cfg_string("features.hash_file");
    if (ehash || strlen(cfg_str) > 0) {
        info_msg(1, "Enabling feature hash table.");
        fhash_init();
    }

    /* Open input; a missing format setting is passed on as "" */
    cfg_str = sally_cfg_string("input.input_format");
    input_config(cfg_str);
    /* "%.40s": the '0' flag is undefined for string conversions */
    info_msg(1, "Opening '%.40s' with input module '%s'.", input, cfg_str);
    entries = input_open(input);
    if (entries < 0)
        fatal("Could not open input source");

    /* Open output; a missing format setting is passed on as "" */
    cfg_str = sally_cfg_string("output.output_format");
    output_config(cfg_str);
    info_msg(1, "Opening '%.40s' with output module '%s'.", output, cfg_str);
    if (!output_open(output))
        fatal("Could not open output destination");
}
Beispiel #2
0
int test_pos_ngrams()
{
    int i, err = 0;
    fvec_t *f;

    /* Test for positional n-grams */
    test_t t[] = {
        {"b b b b b", 3, 0, 1},
        {"b b b b b", 3, 1, 3},
        {"b b b b b", 2, 0, 1},
        {"b b b b b", 2, 1, 4},
        {NULL, 0, 0, 0}
    };

    test_printf("Testing positional n-grams");

    /* Hack to set delimiters */
    config_set_string(&cfg, "features.granularity", "tokens");
    config_set_string(&cfg, "features.token_delim", " ");
    fvec_delim_set(" ");

    for (i = 0; t[i].str; i++) {

        config_set_int(&cfg, "features.ngram_len", t[i].nlen);
        config_set_bool(&cfg, "features.ngram_pos", t[i].flag);
        config_set_int(&cfg, "features.pos_shift", 0);

        /* Extract features */
        f = fvec_extract(t[i].str, strlen(t[i].str));

        /* Check for correct number of dimensions */
        if (f->len != t[i].len) {
            test_error("(%d) len %d != %d", i, f->len, t[i].len);
            err++;
        }

        fvec_destroy(f);
    }

    config_set_bool(&cfg, "features.ngram_pos", 0);

    test_return(err, i);
    return err;
}