Beispiel #1
0
/*
 * A simple test for the TFIDF embedding.
 *
 * The IDF weights are first computed by hand from binary-embedded vectors
 * and checked with idf_check(). Afterwards every string is extracted with
 * the TFIDF embedding, the IDF weights are multiplied out again and the
 * remaining values are expected to sum up to 1.
 *
 * Returns the number of failed checks.
 */
int test_embed_tfidf()
{
    int i, j, n, err = 0;
    string_t strs[10];

    config_set_string(&cfg, "features.vect_norm", "none");
    config_set_string(&cfg, "features.tfidf_file", TEST_TFIDF);

    unlink(TEST_TFIDF);
    char *test_file = getenv("TEST_FILE");
    idf_create(test_file);
    test_printf("Testing TFIDF embedding");

    input_config("lines");
    n = input_open(test_file);
    input_read(strs, n);

    /* Compute IDF manually: log2(n / document frequency) */
    config_set_string(&cfg, "features.vect_embed", "bin");
    fvec_t *w = fvec_zero();
    for (i = 0; i < n; i++) {
        fvec_t *fv = fvec_extract(strs[i].str, strs[i].len);
        fvec_add(w, fv);
        fvec_destroy(fv);
    }
    fvec_invert(w);
    fvec_mul(w, n);
    fvec_log2(w);

    if (!idf_check(w)) {
        err++;
        test_error("(%d) internal idf values seem to be wrong", i);
    }

    /* Invert w for multiplying out IDFs */
    fvec_invert(w);

    config_set_string(&cfg, "features.vect_embed", "tfidf");
    /* Do not reset err here: a failed idf_check() above must be kept */
    for (i = 0; i < n; i++) {
        fvec_t *fv = fvec_extract(strs[i].str, strs[i].len);
        fvec_times(fv, w);

        /* With the IDF part removed, the values must sum up to 1 */
        double d = 0;
        for (j = 0; j < fv->len; j++)
            d += fv->val[j];
        err += fabs(d - 1.0) > 1e-6;
        fvec_destroy(fv);
    }
    test_return(err, n);

    fvec_destroy(w);
    input_free(strs, n);
    input_close();

    idf_destroy();
    unlink(TEST_TFIDF);

    return err;
}
Beispiel #2
0
/*
 * A simple load and save test case.
 *
 * Saves the feature vectors of all entries in tests[] to TEST_FILE, loads
 * them again and compares dimensions and values pairwise.
 *
 * Returns the number of failed comparisons. If the file cannot be created
 * the test is skipped and FALSE is returned.
 */
int test_load_save()
{
    int i, j, err = 0;
    fvec_t *f, *g;
    gzFile *z;

    test_printf("Loading and saving of feature vectors");

    fvec_reset_delim();
    config_set_string(&cfg, "features.ngram_delim", " ");
    config_set_int(&cfg, "features.ngram_len", 2);

    /* Create and save feature vectors */
    z = gzopen(TEST_FILE, "wb9");
    if (!z) {
        printf("Could not create file (ignoring)\n");
        return FALSE;
    }

    for (i = 0; tests[i].str; i++) {
        f = fvec_extract(tests[i].str, strlen(tests[i].str), "test");
        fvec_save(f, z);
        fvec_destroy(f);
    }
    gzclose(z);


    /* Load and compare feature vectors */
    z = gzopen(TEST_FILE, "r");
    if (!z) {
        /* The file was just written; failing to reopen it is an error */
        test_error("Could not open '%s' for reading", TEST_FILE);
        err++;
        unlink(TEST_FILE);
        test_return(err, 1);
        return err;
    }

    for (i = 0; tests[i].str; i++) {
        f = fvec_extract(tests[i].str, strlen(tests[i].str), "test");
        g = fvec_load(z);

        /* Both calls may return NULL; count it instead of crashing */
        if (!f || !g) {
            test_error("(%d) could not extract or load feature vector", i);
            err++;
            if (f)
                fvec_destroy(f);
            if (g)
                fvec_destroy(g);
            continue;
        }

        /* Check dimensions and values */
        for (j = 0; j < f->len && j < g->len; j++) {
            if (f->dim[j] != g->dim[j]) {
                test_error("(%d) f->dim[%d] != g->dim[%d]", i, j, j);
                break;
            }
            if (fabs(f->val[j] - g->val[j]) > 10e-10) {
                test_error("(%d) f->val[%d] != g->val[%d]", i, j, j);
                break;
            }
        }
        /* j stops early on a mismatch or when the lengths differ */
        err += (j < f->len || j < g->len);

        fvec_destroy(f);
        fvec_destroy(g);
    }

    gzclose(z);
    unlink(TEST_FILE);

    test_return(err, i);
    return err;
}
Beispiel #3
0
/**
 * Loads a feature vector from a file stream.
 *
 * The stream is expected to contain a header line of the form
 * "feature vector: len=..., total=..., mem=..., src=..." followed by
 * one line per feature of the form "  <hex dimension>:<value>".
 *
 * @param z Stream pointer
 * @return Feature vector, or NULL if reading, parsing or allocation fails
 */
fvec_t *fvec_load(gzFile * z)
{
    assert(z);
    fvec_t *f;
    char buf[512], str[512];
    int i, r;

    /* Allocate feature vector (zero'd) */
    f = calloc(1, sizeof(fvec_t));
    if (!f) {
        error("Could not load feature vector");
        return NULL;
    }

    /* On EOF or error gzgets leaves buf unset, so never parse it then */
    if (!gzgets(z, buf, (int) sizeof(buf))) {
        error("Could not read feature vector");
        fvec_destroy(f);
        return NULL;
    }
    r = sscanf(buf, "feature vector: len=%lu, total=%lu, mem=%lu, src=%s\n",
               (unsigned long *) &f->len, (unsigned long *) &f->total,
               (unsigned long *) &f->mem, str);
    if (r != 4) {
        error("Could not parse feature vector");
        fvec_destroy(f);
        return NULL;
    }

    /* Set source; the literal "(null)" marks a vector without source */
    if (!strcmp(str, "(null)"))
        f->src = NULL;
    else
        f->src = strdup(str);

    /* Empty feature vector */
    if (f->len == 0)
        return f;

    /* Allocate arrays */
    f->dim = (feat_t *) malloc(f->len * sizeof(feat_t));
    f->val = (float *) malloc(f->len * sizeof(float));
    if (!f->dim || !f->val) {
        error("Could not allocate feature vector contents");
        fvec_destroy(f);
        return NULL;
    }

    /* Load features */
    for (i = 0; i < f->len; i++) {
        if (!gzgets(z, buf, (int) sizeof(buf))) {
            error("Could not read feature vector contents");
            fvec_destroy(f);
            return NULL;
        }
        r = sscanf(buf, "  %llx:%g\n", (unsigned long long *) &f->dim[i],
                   (float *) &f->val[i]);
        if (r != 2) {
            error("Could not parse feature vector contents");
            fvec_destroy(f);
            return NULL;
        }
    }

    return f;
}
Beispiel #4
0
/*
 * Stress test for the feature extraction: feature vectors are repeatedly
 * extracted from random digit strings with a random n-gram length and
 * destroyed again while the feature table is active.
 */
int test_stress()
{
    int run, pos, err = 0;
    char digits[STR_LENGTH + 1];

    test_printf("Stress test for feature vectors");

    config_set_string(&cfg, "features.ngram_delim", "0");

    ftable_init();

    run = 0;
    while (run < STRESS_RUNS) {
        /* Random n-gram length between 1 and 10 */
        config_set_int(&cfg, "features.ngram_len", rand() % 10 + 1);

        /* Fill the buffer with random decimal digits and terminate it */
        for (pos = 0; pos < STR_LENGTH; pos++)
            digits[pos] = rand() % 10 + '0';
        digits[pos] = 0;

        /* Extract the features and release them right away */
        fvec_t *vec = fvec_extract(digits, strlen(digits), "test");
        fvec_destroy(vec);

        run++;
    }

    ftable_destroy();

    test_return(err, STRESS_RUNS);
    return err;
}
Beispiel #5
0
/*
 * Static test for the feature extraction: every entry of tests[] is
 * extracted with its own delimiters and n-gram length, and the number of
 * resulting dimensions is compared with the expected one.
 */
int test_static()
{
    int idx = 0, err = 0;

    test_printf("Extraction of feature vectors");

    while (tests[idx].str) {
        /* Apply the configuration of this test case */
        fvec_reset_delim();
        config_set_string(&cfg, "features.ngram_delim", tests[idx].dlm);
        config_set_int(&cfg, "features.ngram_len", tests[idx].nlen);

        fvec_t *vec =
            fvec_extract(tests[idx].str, strlen(tests[idx].str), "test");

        /* The vector must have exactly the expected number of dimensions */
        if (vec->len != tests[idx].len) {
            test_error("(%d) len %d != %d", idx, vec->len, tests[idx].len);
            err++;
        }

        fvec_destroy(vec);
        idx++;
    }

    test_return(err, idx);
    return err;
}
Beispiel #6
0
/*
 * A simple test for the binary embedding: without normalization the
 * values of every extracted vector must sum up to its number of
 * dimensions.
 */
int test_embed_bin()
{
    int idx, k, n, err = 0;
    string_t strs[10];

    /* Read the test strings line by line from $TEST_FILE */
    input_config("lines");
    char *test_file = getenv("TEST_FILE");
    n = input_open(test_file);
    input_read(strs, n);

    test_printf("Testing binary embedding");
    config_set_string(&cfg, "features.vect_embed", "bin");
    config_set_string(&cfg, "features.vect_norm", "none");

    idx = 0;
    while (idx < n) {
        fvec_t *vec = fvec_extract(strs[idx].str, strs[idx].len);

        /* Sum of all values of this vector */
        double sum = 0;
        for (k = 0; k < vec->len; k++)
            sum += vec->val[k];

        if (fabs(sum - vec->len) > 1e-6)
            err++;

        fvec_destroy(vec);
        idx++;
    }
    test_return(err, n);

    input_free(strs, n);
    input_close();
    return err;
}
Beispiel #7
0
/*
 * A simple test for the L2 norm: after L2 normalization the Euclidean
 * length of every extracted vector must be 1.
 */
int test_norm_l2()
{
    int idx, k, n, err = 0;
    string_t strs[10];

    /* Read the test strings line by line from $TEST_FILE */
    input_config("lines");
    char *test_file = getenv("TEST_FILE");
    n = input_open(test_file);
    input_read(strs, n);

    test_printf("Testing L2 normalization");
    config_set_string(&cfg, "features.vect_norm", "l2");

    idx = 0;
    while (idx < n) {
        fvec_t *vec = fvec_extract(strs[idx].str, strs[idx].len);

        /* Accumulate the squared values of this vector */
        double squares = 0;
        for (k = 0; k < vec->len; k++)
            squares += vec->val[k] * vec->val[k];

        if (fabs(sqrt(squares) - 1.0) > 1e-6)
            err++;

        fvec_destroy(vec);
        idx++;
    }
    test_return(err, n);

    input_free(strs, n);
    input_close();
    return err;
}
Beispiel #8
0
/**
 * Internal: Allocates and extracts a feature vector from a string without
 * postprocessing and no blended n-grams.
 *
 * Depending on whether "features.ngram_delim" is set, either byte n-grams
 * or word n-grams are extracted. If "features.ngram_pos" is enabled the
 * extraction is repeated for every shift in [-pos_shift, pos_shift].
 *
 * @param x String of bytes (with space delimiters)
 * @param l Length of sequence
 * @param n N-gram length
 * @return feature vector, or NULL if an allocation fails
 */
fvec_t *fvec_extract_intern2(char *x, int l, int n)
{
    fvec_t *fv;
    /* Defaults are used if a setting is missing: the lookup functions
     * leave their output untouched when the path is not found. */
    int pos = 0;
    cfg_int shift = 0;
    const char *dlm_str = NULL;
    assert(x && l >= 0);

    /* Allocate feature vector */
    fv = calloc(1, sizeof(fvec_t));
    if (!fv) {
        error("Could not extract feature vector");
        return NULL;
    }

    /* Get configuration */
    config_lookup_string(&cfg, "features.ngram_delim", &dlm_str);
    config_lookup_bool(&cfg, "features.ngram_pos", &pos);
    config_lookup_int(&cfg, "features.pos_shift", &shift);

    /* Check for empty sequence */
    if (l == 0)
        return fv;

    /* Sanitize shift value */
    if (!pos)
        shift = 0;

    /* Allocate arrays: one slot per input byte and per position shift */
    int space = 2 * shift + 1;
    fv->dim = (feat_t *) malloc(l * sizeof(feat_t) * space);
    fv->val = (float *) malloc(l * sizeof(float) * space);

    if (!fv->dim || !fv->val) {
        error("Could not allocate feature vector contents");
        fvec_destroy(fv);
        return NULL;
    }

    /* Loop over position shifts (0 if pos is disabled) */
    for (int s = -shift; s <= shift; s++) {
        if (!dlm_str || strlen(dlm_str) == 0) {
            extract_ngrams(fv, x, l, n, pos, s);
        } else {
            extract_wgrams(fv, x, l, n, pos, s);
        }
    }

    /* Sort extracted features */
    qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat);

    /* Count features  */
    count_feat(fv);

    return fv;
}
Beispiel #9
0
/*
 * A stress test for the addition of feature vectors: a random vector is
 * added to a base vector and subtracted again, NUM_VECTORS times in a row.
 * After every addition the L2 norm of the sum is compared with sqrt(2).
 */
int test_stress_add()
{
    int round, k, err = 0;
    fvec_t *base, *sum, *rnd;
    char digits[STR_LENGTH + 1];

    test_printf("Stress test for addition of feature vectors");

    /* Base vector that every random vector is added to */
    base = fvec_extract("aa0bb0cc", 8, "zero");

    for (round = 0; round < NUM_VECTORS; round++) {
        /* Build a random string of decimal digits */
        k = 0;
        while (k < STR_LENGTH) {
            digits[k] = rand() % 10 + '0';
            k++;
        }
        digits[k] = 0;

        rnd = fvec_extract(digits, strlen(digits), "test");

        /* sum = base + rnd; the old base is no longer needed */
        sum = fvec_add(base, rnd);
        fvec_destroy(base);

        if (fabs(fvec_norm2(sum) - 1.4142135623) > 1e-7)
            err++;

        /* Subtract rnd again to obtain the base for the next round */
        base = fvec_sub(sum, rnd);
        fvec_sparsify(base);

        /* Clean up */
        fvec_destroy(sum);
        fvec_destroy(rnd);
    }

    fvec_destroy(base);
    test_return(err, round);
    return err;
}
Beispiel #10
0
/*
 * Static test for the dot product of feature vectors: for every pair in
 * test_dot[] the computed dot product is compared with the stored result.
 */
int test_static_dot()
{
    int idx = 0, err = 0;

    test_printf("Dot product of feature vectors");

    while (test_dot[idx].x) {
        /* Extract both operands */
        fvec_t *lhs =
            fvec_extract(test_dot[idx].x, strlen(test_dot[idx].x), "test");
        fvec_t *rhs =
            fvec_extract(test_dot[idx].y, strlen(test_dot[idx].y), "test");

        /* Compare the dot product with the expected value */
        double dot = fvec_dot(lhs, rhs);
        if (fabs(dot - test_dot[idx].res) > 1e-6)
            err++;

        fvec_destroy(lhs);
        fvec_destroy(rhs);
        idx++;
    }

    test_return(err, idx);
    return err;
}
Beispiel #11
0
/*
 * A stress test for the dot product of feature vectors.
 *
 * For NUM_VECTORS pairs of random digit strings three properties are
 * checked: the L2 norm of each vector equals the square root of its dot
 * product with itself, and the absolute dot product of the pair does not
 * exceed the sum of the two self dot products.
 *
 * Returns the number of failed checks.
 */
int test_stress_dot()
{
    int i, j, err = 0;
    fvec_t *fx, *fy;
    char buf[STR_LENGTH + 1];

    test_printf("Stress test for dot product of feature vectors");

    for (i = 0; i < NUM_VECTORS; i++) {

        /* Create first random string and its vector */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;
        fx = fvec_extract(buf, strlen(buf), "test");

        /* Create second random string and its vector */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;
        fy = fvec_extract(buf, strlen(buf), "test");

        double nx = fvec_dot(fx, fx);
        double ny = fvec_dot(fy, fy);
        err += fabs(fvec_norm2(fx) - sqrt(nx)) > 1e-7;
        err += fabs(fvec_norm2(fy) - sqrt(ny)) > 1e-7;
        /* fabs applies to the dot product, not to the comparison result */
        err += fabs(fvec_dot(fx, fy)) > nx + ny;

        /* Clean up */
        fvec_destroy(fx);
        fvec_destroy(fy);
    }

    /* Three checks are performed per iteration */
    test_return(err, 3 * i);
    return err;
}
Beispiel #12
0
/*
 * Static test for the addition of feature vectors: for every pair in
 * test_add[] the L1 norm of the sum is compared with the stored result.
 */
int test_static_add()
{
    int idx = 0, err = 0;

    test_printf("Addition of feature vectors");

    while (test_add[idx].x) {
        /* Extract both operands */
        fvec_t *lhs =
            fvec_extract(test_add[idx].x, strlen(test_add[idx].x), "test");
        fvec_t *rhs =
            fvec_extract(test_add[idx].y, strlen(test_add[idx].y), "test");

        /* Add them and check the L1 norm of the result */
        fvec_t *sum = fvec_add(lhs, rhs);
        if (fabs(fvec_norm1(sum) - test_add[idx].res) > 1e-7)
            err++;

        fvec_destroy(sum);
        fvec_destroy(lhs);
        fvec_destroy(rhs);
        idx++;
    }

    test_return(err, idx);
    return err;
}
Beispiel #13
0
/*
 * Test for positional n-grams: the same token string is extracted with
 * and without positional n-grams and the number of resulting dimensions
 * is compared with the expected one.
 */
int test_pos_ngrams()
{
    int idx, err = 0;

    /* Test cases: string, n-gram length, positional flag, expected length */
    test_t cases[] = {
        {"b b b b b", 3, 0, 1},
        {"b b b b b", 3, 1, 3},
        {"b b b b b", 2, 0, 1},
        {"b b b b b", 2, 1, 4},
        {NULL, 0, 0, 0}
    };

    test_printf("Testing positional n-grams");

    /* Hack to set delimiters */
    config_set_string(&cfg, "features.granularity", "tokens");
    config_set_string(&cfg, "features.token_delim", " ");
    fvec_delim_set(" ");

    idx = 0;
    while (cases[idx].str) {
        /* Apply the configuration of this test case, without shifting */
        config_set_int(&cfg, "features.ngram_len", cases[idx].nlen);
        config_set_bool(&cfg, "features.ngram_pos", cases[idx].flag);
        config_set_int(&cfg, "features.pos_shift", 0);

        fvec_t *vec = fvec_extract(cases[idx].str, strlen(cases[idx].str));

        /* The vector must have exactly the expected number of dimensions */
        if (vec->len != cases[idx].len) {
            test_error("(%d) len %d != %d", idx, vec->len, cases[idx].len);
            err++;
        }

        fvec_destroy(vec);
        idx++;
    }

    /* Switch positional n-grams off again */
    config_set_bool(&cfg, "features.ngram_pos", 0);

    test_return(err, idx);
    return err;
}
Beispiel #14
0
/**
 * Internal: Allocates and extracts a feature vector from a string
 * without postprocessing but blended n-grams.
 *
 * The vector is extracted with the configured n-gram length. If
 * "features.ngram_blend" is enabled, the vectors for all shorter lengths
 * 1..n-1 are extracted as well and added to it.
 *
 * @param x String of bytes (with space delimiters)
 * @param l Length of sequence
 * @return feature vector, or NULL if an extraction fails
 */
fvec_t *fvec_extract_intern(char *x, int l)
{
    int blend;
    cfg_int i, n;

    /* Get config */
    config_lookup_bool(&cfg, "features.ngram_blend", &blend);
    config_lookup_int(&cfg, "features.ngram_len", &n);

    /* Extract n-grams; the callee has already reported any error */
    fvec_t *fv = fvec_extract_intern2(x, l, n);
    if (!fv)
        return NULL;

    /* Blended n-grams */
    for (i = 1; blend && i < n; i++) {
        fvec_t *fx = fvec_extract_intern2(x, l, i);
        if (!fx) {
            /* Do not return a partially blended vector */
            fvec_destroy(fv);
            return NULL;
        }
        fvec_add(fv, fx);
        fvec_destroy(fx);
    }

    return fv;
}
Beispiel #15
0
/**
 * Creates an independent copy of a feature vector, including its source
 * string and its dimension and value arrays.
 * @param o Feature vector to copy
 * @return Cloned feature vector, or NULL if an allocation fails
 */
fvec_t *fvec_clone(fvec_t *o)
{
    assert(o);

    /* Zero-initialized, so all pointers start out as NULL */
    fvec_t *copy = calloc(1, sizeof(fvec_t));
    if (copy == NULL) {
        error("Could not clone feature vector");
        return NULL;
    }

    /* Plain bookkeeping fields carry over unchanged */
    copy->mem = o->mem;
    copy->total = o->total;
    copy->len = o->len;

    /* The source string is duplicated, not shared */
    if (o->src != NULL)
        copy->src = strdup(o->src);

    /* An empty vector has no arrays to copy */
    if (o->len == 0)
        return copy;

    copy->dim = (feat_t *) malloc(o->len * sizeof(feat_t));
    copy->val = (float *) malloc(o->len * sizeof(float));
    if (copy->dim == NULL || copy->val == NULL) {
        error("Could not allocate feature vector");
        fvec_destroy(copy);
        return NULL;
    }

    /* Copy dimensions and values element by element */
    unsigned int k = 0;
    while (k < o->len) {
        copy->dim[k] = o->dim[k];
        copy->val[k] = o->val[k];
        k++;
    }

    return copy;
}
Beispiel #16
0
/**
 * Print shared n-grams for each cluster.
 *
 * For every cluster the binarized feature vectors of its members are
 * summed up and divided by the number of members, which yields for each
 * n-gram the ratio of members containing it. All n-grams whose ratio is
 * at least "cluster.shared_ngrams" are appended to the output file.
 * Nothing is exported if that setting is not positive.
 *
 * @param c Clustering structure
 * @param fa Array of feature vectors
 * @param file Output file
 */
void export_shared_ngrams(cluster_t *c, farray_t *fa, const char *file)
{
    assert(c && fa && file);
    int i, j, k, b;
    /* Stays 0 (export disabled) if the setting cannot be looked up */
    double shared = 0.0;
    FILE *f;
    char *name = NULL;

    config_lookup_float(&cfg, "cluster.shared_ngrams", &shared);
    if (shared <= 0.0)
        return;

    if (verbose > 0)
        printf("Exporting shared n-grams with minimum ratio %4.2f.\n",
               shared);

    if (!(f = fopen(file, "a"))) {
        error("Could not create file '%s'.", file);
        return;
    }

    /* Print incremental header */
    fprintf(f, "# ---\n# Shared n-grams for %s\n", fa->src);
    fprintf(f, "# Minimum ratio of shared n-grams: %4.2f (%2.0f%%)\n",
            shared, shared * 100);
    fprintf(f, "# ---\n# <cluster> <ratio> <hash> <ngram>\n");

    /* Compute shared n-grams per cluster */
    for (i = 0; i < c->num; i++) {
        fvec_t *s = fvec_zero();

        /* k counts the members of cluster i */
        for (j = 0, k = 0; j < c->len; j++) {
            if (c->cluster[j] != i)
                continue;

            /* Clone and binarize */
            fvec_t *x = fvec_clone(fa->x[j]);
            fvec_bin(x);

            /* The first member provides the name of the cluster */
            if (k == 0)
                name = cluster_get_name(c, j);

            /* Merge n-grams in cluster */
            fvec_t *y = fvec_add(s, x);
            fvec_destroy(s);
            fvec_destroy(x);
            s = y;
            k++;
        }

        /* Check for empty cluster */
        if (k == 0)
            continue;

        /* Turn the counts into ratios of members */
        fvec_div(s, k);

        /* Output shared n-grams */
        for (j = 0; j < s->len; j++) {
            if (s->val[j] < shared)
                continue;

            fprintf(f, "%s %6.4f %.16llx ", name, s->val[j],
                    (long long unsigned int) s->dim[j]);

            /* Lookup feature */
            fentry_t *fe = ftable_get(s->dim[j]);
            if (!fe) {
                /* Keep the row well-formed, but never dereference NULL */
                error("Oops. Feature not in lookup table.");
                fprintf(f, "\"\"\n");
                continue;
            }

            /* Print feature */
            fprintf(f, "\"");
            for (b = 0; b < fe->len; b++) {
                /* isprint() and %x need the byte as a non-negative value */
                unsigned char ch = (unsigned char) fe->data[b];

                /* NOTE(review): '%' is printable, so the second test is
                 * redundant and '%' is written unescaped. Presumably it
                 * was meant to be escaped - confirm before changing the
                 * output format. */
                if (isprint(ch) || ch == '%')
                    fprintf(f, "%c", ch);
                else
                    fprintf(f, "%%%.2x", (unsigned int) ch);
            }
            fprintf(f, "\"\n");
        }
        fvec_destroy(s);
    }

    fclose(f);
}
Beispiel #17
0
/**
 * Allocate and extract a feature vector from a sequence.
 * There is a global table of delimiter symbols which is only 
 * initialized once the first sequence is processed. 
 * See fvec_reset_delim();
 * @param x Sequence of bytes 
 * @param l Length of sequence
 * @param s Source of features, e.g. file name
 * @return feature vector
 */
fvec_t *fvec_extract(char *x, int l, char *s)
{
    fvec_t *fv;
    int nlen;
    const char *dlm_str, *cfg_str;
    assert(x && l >= 0);

    /* Allocate feature vector */
    fv = calloc(1, sizeof(fvec_t));
    if (!fv) {
        error("Could not extract feature vector");
        return NULL;
    }

    /* Initialize feature vector */
    fv->len = 0;
    fv->total = 0;
    fv->dim = (feat_t *) malloc(l * sizeof(feat_t));
    fv->val = (float *) malloc(l * sizeof(float));
    fv->mem = sizeof(fvec_t);

    /* Set source */
    if (s) {
        fv->src = strdup(s);
        fv->mem += strlen(s);
    }

    /* Check for empty sequence */
    if (l == 0)
        return fv;

    if (!fv->dim || !fv->val) {
        error("Could not allocate feature vector");
        fvec_destroy(fv);
        return NULL;
    }

    /* Get n-gram length */
    config_lookup_int(&cfg, "features.ngram_len", (int *) &nlen);

    /* Construct delimiter lookup table */
    config_lookup_string(&cfg, "features.ngram_delim", &dlm_str);

    /* N-grams of bytes */
    if (!dlm_str || strlen(dlm_str) == 0) {
        /* Feature extraction */
        extract_ngrams(fv, x, l, nlen);
    } else {
        if (delim[0] == DELIM_NOT_INIT) {
            memset(delim, 0, 256);
            decode_delim(dlm_str);
        }

        /* Feature extraction */
        extract_wgrams(fv, x, l, nlen);
    }
    fv->total = fv->len;

    /* Sort extracted features */
    qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat);

    /* Compute embedding and condense */
    config_lookup_string(&cfg, "features.vect_embed", &cfg_str);
    if (!strcasecmp(cfg_str, "cnt")) {
        fvec_condense(fv, EMBED_CNT);
    } else if (!strcasecmp(cfg_str, "bin")) {
        fvec_condense(fv, EMBED_BIN);
    } else {
        warning("Unknown embedding '%s', using 'cnt'.", cfg_str);
        fvec_condense(fv, EMBED_CNT);
    }

    /* Compute l2 normalization */
    fvec_normalize(fv, NORM_L2);
    return fv;
}