/* * A simple test for the binary embedding */ int test_embed_tfidf() { int i, j, n, err = 0; string_t strs[10]; config_set_string(&cfg, "features.vect_norm", "none"); config_set_string(&cfg, "features.tfidf_file", TEST_TFIDF); unlink(TEST_TFIDF); char *test_file = getenv("TEST_FILE"); idf_create(test_file); test_printf("Testing TFIDF embedding"); input_config("lines"); n = input_open(test_file); input_read(strs, n); /* Compute IDF manually */ config_set_string(&cfg, "features.vect_embed", "bin"); fvec_t *w = fvec_zero(); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); fvec_add(w, fv); fvec_destroy(fv); } fvec_invert(w); fvec_mul(w, n); fvec_log2(w); if (!idf_check(w)) { err++; test_error("(%d) internal idf values seem to be wrong", i); } /* Invert w for multiplying out IDFs */ fvec_invert(w); config_set_string(&cfg, "features.vect_embed", "tfidf"); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); fvec_times(fv, w); /* Check if rest tf */ double d = 0; for (j = 0; j < fv->len; j++) d += fv->val[j]; err += fabs(d - 1.0) > 1e-6; fvec_destroy(fv); } test_return(err, n); fvec_destroy(w); input_free(strs, n); input_close(); idf_destroy(); unlink(TEST_TFIDF); return err; }
/* * A simple load and save test case */ int test_load_save() { int i, j, err = 0; fvec_t *f, *g; gzFile *z; test_printf("Loading and saving of feature vectors"); fvec_reset_delim(); config_set_string(&cfg, "features.ngram_delim", " "); config_set_int(&cfg, "features.ngram_len", 2); /* Create and save feature vectors */ z = gzopen(TEST_FILE, "wb9"); if (!z) { printf("Could not create file (ignoring)\n"); return FALSE; } for (i = 0; tests[i].str; i++) { f = fvec_extract(tests[i].str, strlen(tests[i].str), "test"); fvec_save(f, z); fvec_destroy(f); } gzclose(z); /* Load and compare feature vectors */ z = gzopen(TEST_FILE, "r"); for (i = 0; tests[i].str; i++) { f = fvec_extract(tests[i].str, strlen(tests[i].str), "test"); g = fvec_load(z); /* Check dimensions and values */ for (j = 0; j < f->len && j < g->len; j++) { if (f->dim[j] != g->dim[j]) { test_error("(%d) f->dim[%d] != g->dim[%d]", i, j, j); break; } if (fabs(f->val[j] - g->val[j]) > 10e-10) { test_error("(%d) f->val[%d] != g->val[%d]", i, j, j); break; } } err += (j < f->len || j < g->len); fvec_destroy(f); fvec_destroy(g); } gzclose(z); unlink(TEST_FILE); test_return(err, i); return err; }
/**
 * Loads a feature vector from a file stream.
 *
 * The expected format is one header line
 * "feature vector: len=..., total=..., mem=..., src=..." followed by
 * len lines of the form " <hex dimension>:<value>". A source of
 * "(null)" is stored as a NULL pointer. Since the source is parsed
 * with %s it ends at the first whitespace character.
 *
 * @param z Stream pointer
 * @return Feature vector, or NULL if allocation, reading or parsing
 *         fails (including a premature end of the stream)
 */
fvec_t *fvec_load(gzFile * z)
{
    assert(z);
    fvec_t *f;
    char buf[512], str[512];
    int i, r;

    /* Allocate feature vector (zero'd) */
    f = calloc(1, sizeof(fvec_t));
    if (!f) {
        error("Could not load feature vector");
        return NULL;
    }

    /* Parse the header only if a line was actually read; otherwise
     * buf would be uninitialized */
    r = 0;
    if (gzgets(z, buf, (int) sizeof(buf)))
        r = sscanf(buf, "feature vector: len=%lu, total=%lu, mem=%lu, src=%s\n",
                   (unsigned long *) &f->len, (unsigned long *) &f->total,
                   (unsigned long *) &f->mem, str);
    if (r != 4) {
        error("Could not parse feature vector");
        fvec_destroy(f);
        return NULL;
    }

    /* Set source */
    if (!strcmp(str, "(null)"))
        f->src = NULL;
    else
        f->src = strdup(str);

    /* Empty feature vector */
    if (f->len == 0)
        return f;

    /* Allocate arrays */
    f->dim = (feat_t *) malloc(f->len * sizeof(feat_t));
    f->val = (float *) malloc(f->len * sizeof(float));
    if (!f->dim || !f->val) {
        error("Could not allocate feature vector contents");
        fvec_destroy(f);
        return NULL;
    }

    /* Load features */
    for (i = 0; i < f->len; i++) {
        /* A failed read must not re-parse the previous line that is
         * still sitting in buf */
        r = 0;
        if (gzgets(z, buf, (int) sizeof(buf)))
            r = sscanf(buf, " %llx:%g\n", (unsigned long long *) &f->dim[i],
                       (float *) &f->val[i]);
        if (r != 2) {
            error("Could not parse feature vector contents");
            fvec_destroy(f);
            return NULL;
        }
    }

    return f;
}
/*
 * Stress test for feature extraction with an active feature table.
 *
 * For STRESS_RUNS rounds a random n-gram length between 1 and 10 is
 * configured and a random string of STR_LENGTH decimal digits is
 * extracted and destroyed again. No check is performed on the result,
 * so the returned error count is always 0.
 */
int test_stress()
{
    int round = 0, pos, err = 0;
    char buf[STR_LENGTH + 1];

    test_printf("Stress test for feature vectors");

    config_set_string(&cfg, "features.ngram_delim", "0");
    ftable_init();

    while (round < STRESS_RUNS) {
        int nlen = rand() % 10 + 1;
        config_set_int(&cfg, "features.ngram_len", nlen);

        /* Fill the buffer with random digits and terminate it */
        for (pos = 0; pos < STR_LENGTH; pos++)
            buf[pos] = rand() % 10 + '0';
        buf[pos] = 0;

        /* Extract and immediately release the features */
        fvec_t *fv = fvec_extract(buf, strlen(buf), "test");
        fvec_destroy(fv);

        round++;
    }

    ftable_destroy();

    test_return(err, STRESS_RUNS);
    return err;
}
/*
 * Static test for the extraction of feature vectors.
 *
 * Each entry of the global tests table supplies a string, a delimiter
 * setting and an n-gram length; the number of extracted dimensions must
 * match the expected length stored in the entry.
 *
 * Returns the number of entries with a wrong dimension count.
 */
int test_static()
{
    int idx = 0, err = 0;

    test_printf("Extraction of feature vectors");

    while (tests[idx].str) {
        /* Apply the per-test delimiter and n-gram configuration */
        fvec_reset_delim();
        config_set_string(&cfg, "features.ngram_delim", tests[idx].dlm);
        config_set_int(&cfg, "features.ngram_len", tests[idx].nlen);

        fvec_t *fv = fvec_extract(tests[idx].str, strlen(tests[idx].str),
                                  "test");

        /* Compare against the expected number of dimensions */
        if (fv->len != tests[idx].len) {
            test_error("(%d) len %d != %d", idx, fv->len, tests[idx].len);
            err++;
        }

        fvec_destroy(fv);
        idx++;
    }

    test_return(err, idx);
    return err;
}
/* * A simple test for the binary embedding */ int test_embed_bin() { int i, j, n, err = 0; string_t strs[10]; input_config("lines"); char *test_file = getenv("TEST_FILE"); n = input_open(test_file); input_read(strs, n); test_printf("Testing binary embedding"); config_set_string(&cfg, "features.vect_embed", "bin"); config_set_string(&cfg, "features.vect_norm", "none"); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); double n = 0; for (j = 0; j < fv->len; j++) n += fv->val[j]; err += fabs(n - fv->len) > 1e-6; fvec_destroy(fv); } test_return(err, n); input_free(strs, n); input_close(); return err; }
/* * A simple test for the l2 norm */ int test_norm_l2() { int i, j, n, err = 0; string_t strs[10]; input_config("lines"); char *test_file = getenv("TEST_FILE"); n = input_open(test_file); input_read(strs, n); test_printf("Testing L2 normalization"); config_set_string(&cfg, "features.vect_norm", "l2"); for (i = 0, err = 0; i < n; i++) { fvec_t *fv = fvec_extract(strs[i].str, strs[i].len); double n = 0; for (j = 0; j < fv->len; j++) n += fv->val[j] * fv->val[j]; err += fabs(sqrt(n) - 1.0) > 1e-6; fvec_destroy(fv); } test_return(err, n); input_free(strs, n); input_close(); return err; }
/** * Internal: Allocates and extracts a feature vector from a string without * postprocessing and no blended n-grams. * @param x String of bytes (with space delimiters) * @param l Length of sequence * @param n N-gram length * @return feature vector */ fvec_t *fvec_extract_intern2(char *x, int l, int n) { fvec_t *fv; int pos; cfg_int shift; const char *dlm_str; assert(x && l >= 0); /* Allocate feature vector */ fv = calloc(1, sizeof(fvec_t)); if (!fv) { error("Could not extract feature vector"); return NULL; } /* Get configuration */ config_lookup_string(&cfg, "features.ngram_delim", &dlm_str); config_lookup_bool(&cfg, "features.ngram_pos", &pos); config_lookup_int(&cfg, "features.pos_shift", &shift); /* Check for empty sequence */ if (l == 0) return fv; /* Sanitize shift value */ if (!pos) shift = 0; /* Allocate arrays */ int space = 2 * shift + 1; fv->dim = (feat_t *) malloc(l * sizeof(feat_t) * space); fv->val = (float *) malloc(l * sizeof(float) * space); if (!fv->dim || !fv->val) { error("Could not allocate feature vector contents"); fvec_destroy(fv); return NULL; } /* Get configuration */ config_lookup_string(&cfg, "features.ngram_delim", &dlm_str); /* Loop over position shifts (0 if pos is disabled) */ for (int s = -shift; s <= shift; s++) { if (!dlm_str || strlen(dlm_str) == 0) { extract_ngrams(fv, x, l, n, pos, s); } else { extract_wgrams(fv, x, l, n, pos, s); } } /* Sort extracted features */ qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat); /* Count features */ count_feat(fv); return fv; }
/*
 * Stress test for the addition of feature vectors.
 *
 * A base vector is extracted from "aa0bb0cc". In each of NUM_VECTORS
 * rounds a random digit string is extracted and added to it; the
 * 2-norm of the sum is compared with 1.4142135623. The random vector
 * is then subtracted again and the result is sparsified to serve as
 * the base of the next round.
 *
 * Returns the number of rounds with an unexpected norm.
 */
int test_stress_add()
{
    int round, pos, err = 0;
    char buf[STR_LENGTH + 1];

    test_printf("Stress test for addition of feature vectors");

    /* Base vector carried through all rounds */
    fvec_t *base = fvec_extract("aa0bb0cc", 8, "zero");

    for (round = 0; round < NUM_VECTORS; round++) {
        /* Build a random string of digits */
        for (pos = 0; pos < STR_LENGTH; pos++)
            buf[pos] = rand() % 10 + '0';
        buf[pos] = 0;

        fvec_t *rnd = fvec_extract(buf, strlen(buf), "test");

        /* Add the random vector; the old base is no longer needed */
        fvec_t *sum = fvec_add(base, rnd);
        fvec_destroy(base);

        if (fabs(fvec_norm2(sum) - 1.4142135623) > 1e-7)
            err++;

        /* Subtract the random vector again to get the next base */
        base = fvec_sub(sum, rnd);
        fvec_sparsify(base);

        /* Clean up */
        fvec_destroy(sum);
        fvec_destroy(rnd);
    }

    fvec_destroy(base);
    test_return(err, round);
    return err;
}
/*
 * Static test for the dot product of feature vectors.
 *
 * For each entry of the global test_dot table both strings are
 * extracted and their dot product is compared with the expected result
 * stored in the entry (tolerance 1e-6).
 *
 * Returns the number of entries with a deviating dot product.
 */
int test_static_dot()
{
    int idx = 0, err = 0;

    test_printf("Dot product of feature vectors");

    while (test_dot[idx].x) {
        /* Extract both operands */
        fvec_t *lhs = fvec_extract(test_dot[idx].x, strlen(test_dot[idx].x),
                                   "test");
        fvec_t *rhs = fvec_extract(test_dot[idx].y, strlen(test_dot[idx].y),
                                   "test");

        /* Compare the dot product with the expected value */
        double d = fvec_dot(lhs, rhs);
        if (fabs(d - test_dot[idx].res) > 1e-6)
            err++;

        fvec_destroy(lhs);
        fvec_destroy(rhs);
        idx++;
    }

    test_return(err, idx);
    return err;
}
/*
 * A stress test for the dot product of feature vectors.
 *
 * In each of NUM_VECTORS rounds two random digit strings are extracted
 * and three properties are checked:
 *   - the 2-norm of x equals the square root of <x,x>
 *   - the 2-norm of y equals the square root of <y,y>
 *   - the absolute value of <x,y> does not exceed <x,x> + <y,y>
 *
 * Returns the number of failed checks (at most 3 per round).
 */
int test_stress_dot()
{
    int i, j, err = 0;
    fvec_t *fx, *fy;
    char buf[STR_LENGTH + 1];

    test_printf("Stress test for dot product of feature vectors");

    for (i = 0; i < NUM_VECTORS; i++) {
        /* Create first random string and vector */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;
        fx = fvec_extract(buf, strlen(buf), "test");

        /* Create second random string and vector */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;
        fy = fvec_extract(buf, strlen(buf), "test");

        double nx = fvec_dot(fx, fx);
        double ny = fvec_dot(fy, fy);
        err += fabs(fvec_norm2(fx) - sqrt(nx)) > 1e-7;
        err += fabs(fvec_norm2(fy) - sqrt(ny)) > 1e-7;
        /* fabs() applies to the dot product, not to the result of the
         * comparison */
        err += fabs(fvec_dot(fx, fy)) > nx + ny;

        /* Clean up */
        fvec_destroy(fx);
        fvec_destroy(fy);
    }

    test_return(err, 3 * i);
    return err;
}
/*
 * Static test for the addition of feature vectors.
 *
 * For each entry of the global test_add table both strings are
 * extracted and added; the 1-norm of the sum is compared with the
 * expected result stored in the entry (tolerance 1e-7).
 *
 * Returns the number of entries with a deviating norm.
 */
int test_static_add()
{
    int idx = 0, err = 0;

    test_printf("Addition of feature vectors");

    while (test_add[idx].x) {
        /* Extract both operands */
        fvec_t *lhs = fvec_extract(test_add[idx].x, strlen(test_add[idx].x),
                                   "test");
        fvec_t *rhs = fvec_extract(test_add[idx].y, strlen(test_add[idx].y),
                                   "test");

        /* Add them and check the norm of the result */
        fvec_t *sum = fvec_add(lhs, rhs);
        if (fabs(fvec_norm1(sum) - test_add[idx].res) > 1e-7)
            err++;

        fvec_destroy(sum);
        fvec_destroy(lhs);
        fvec_destroy(rhs);
        idx++;
    }

    test_return(err, idx);
    return err;
}
int test_pos_ngrams() { int i, err = 0; fvec_t *f; /* Test for positional n-grams */ test_t t[] = { {"b b b b b", 3, 0, 1}, {"b b b b b", 3, 1, 3}, {"b b b b b", 2, 0, 1}, {"b b b b b", 2, 1, 4}, {NULL, 0, 0, 0} }; test_printf("Testing positional n-grams"); /* Hack to set delimiters */ config_set_string(&cfg, "features.granularity", "tokens"); config_set_string(&cfg, "features.token_delim", " "); fvec_delim_set(" "); for (i = 0; t[i].str; i++) { config_set_int(&cfg, "features.ngram_len", t[i].nlen); config_set_bool(&cfg, "features.ngram_pos", t[i].flag); config_set_int(&cfg, "features.pos_shift", 0); /* Extract features */ f = fvec_extract(t[i].str, strlen(t[i].str)); /* Check for correct number of dimensions */ if (f->len != t[i].len) { test_error("(%d) len %d != %d", i, f->len, t[i].len); err++; } fvec_destroy(f); } config_set_bool(&cfg, "features.ngram_pos", 0); test_return(err, i); return err; }
/*
 * Internal: Allocates and extracts a feature vector from a string
 * without postprocessing but blended n-grams.
 *
 * The vector is extracted with the configured "features.ngram_len".
 * If "features.ngram_blend" is enabled, the vectors for all shorter
 * n-gram lengths (1 .. ngram_len - 1) are extracted as well and added
 * to it.
 *
 * @param x String of bytes (with space delimiters)
 * @param l Length of sequence
 * @return feature vector, or NULL if any extraction failed
 */
fvec_t *fvec_extract_intern(char *x, int l)
{
    /* Defaults in case a lookup fails and leaves its target untouched */
    int blend = 0;
    cfg_int i, n = 0;

    /* Get config */
    config_lookup_bool(&cfg, "features.ngram_blend", &blend);
    config_lookup_int(&cfg, "features.ngram_len", &n);

    /* Extract n-grams */
    fvec_t *fv = fvec_extract_intern2(x, l, n);
    if (!fv)
        return NULL;

    /* Blended n-grams */
    for (i = 1; blend && i < n; i++) {
        fvec_t *fx = fvec_extract_intern2(x, l, i);
        if (!fx) {
            /* Do not hand a partially blended vector to the caller */
            fvec_destroy(fv);
            return NULL;
        }
        fvec_add(fv, fx);
        fvec_destroy(fx);
    }

    return fv;
}
/**
 * Creates a deep copy of a feature vector.
 *
 * The counters len, total and mem are copied, the source string is
 * duplicated if present, and the dimension and value arrays are copied
 * into newly allocated memory. An empty vector is cloned without
 * allocating arrays.
 *
 * @param o Feature vector to copy
 * @return Cloned feature vector, or NULL if memory could not be
 *         allocated
 */
fvec_t *fvec_clone(fvec_t *o)
{
    assert(o);

    fvec_t *copy = calloc(1, sizeof(fvec_t));
    if (copy == NULL) {
        error("Could not clone feature vector");
        return NULL;
    }

    /* Copy the scalar members and the source string */
    copy->len = o->len;
    copy->total = o->total;
    copy->mem = o->mem;
    if (o->src != NULL)
        copy->src = strdup(o->src);

    /* Nothing else to copy for an empty vector */
    if (o->len == 0)
        return copy;

    copy->dim = (feat_t *) malloc(o->len * sizeof(feat_t));
    copy->val = (float *) malloc(o->len * sizeof(float));
    if (copy->dim == NULL || copy->val == NULL) {
        error("Could not allocate feature vector");
        fvec_destroy(copy);
        return NULL;
    }

    /* Copy dimensions and values */
    memcpy(copy->dim, o->dim, o->len * sizeof(feat_t));
    memcpy(copy->val, o->val, o->len * sizeof(float));

    return copy;
}
/**
 * Print shared n-grams for each cluster.
 *
 * For every cluster the binarized feature vectors of its members are
 * summed and divided by the member count, giving for each n-gram the
 * ratio of members containing it. N-grams whose ratio reaches the
 * configured "cluster.shared_ngrams" threshold are appended to the
 * output file as "<cluster> <ratio> <hash> <ngram>". Non-printable
 * bytes of an n-gram are written as %xx.
 *
 * Nothing is written if the threshold is not positive.
 *
 * @param c Clustering structure
 * @param fa Array of feature vectors
 * @param file Output file (opened in append mode)
 */
void export_shared_ngrams(cluster_t *c, farray_t *fa, const char *file)
{
    assert(c && fa && file);
    int i, j, k;
    /* Stays 0 (export disabled) if the lookup fails */
    double shared = 0.0;
    FILE *f;
    char *name = NULL;

    config_lookup_float(&cfg, "cluster.shared_ngrams", &shared);
    if (shared <= 0.0)
        return;

    if (verbose > 0)
        printf("Exporting shared n-grams with minimum ratio %4.2f.\n",
               shared);

    if (!(f = fopen(file, "a"))) {
        error("Could not create file '%s'.", file);
        return;
    }

    /* Print incremental header */
    fprintf(f, "# ---\n# Shared n-grams for %s\n", fa->src);
    fprintf(f, "# Minimum ratio of shared n-grams: %4.2f (%2.0f%%)\n",
            shared, shared * 100);
    fprintf(f, "# ---\n# <cluster> <ratio> <hash> <ngram>\n");

    /* Compute shared n-grams per cluster */
    for (i = 0; i < c->num; i++) {
        fvec_t *s = fvec_zero();

        for (j = 0, k = 0; j < c->len; j++) {
            if (c->cluster[j] != i)
                continue;

            /* Clone and binarize */
            fvec_t *x = fvec_clone(fa->x[j]);
            fvec_bin(x);

            /* The first member provides the cluster name */
            if (k == 0)
                name = cluster_get_name(c, j);

            /* Merge n-grams in cluster */
            fvec_t *y = fvec_add(s, x);
            fvec_destroy(s);
            fvec_destroy(x);
            s = y;
            k++;
        }

        /* Check for empty cluster; release the unused accumulator */
        if (k == 0) {
            fvec_destroy(s);
            continue;
        }

        /* Turn member counts into ratios */
        fvec_div(s, k);

        /* Output shared n-grams */
        for (j = 0; j < s->len; j++) {
            if (s->val[j] < shared)
                continue;

            fprintf(f, "%s %6.4f %.16llx ", name, s->val[j],
                    (long long unsigned int) s->dim[j]);

            /* Lookup feature */
            fentry_t *fe = ftable_get(s->dim[j]);
            if (!fe) {
                /* Keep the line well-formed with an empty n-gram
                 * instead of dereferencing the missing entry */
                error("Oops. Feature not in lookup table.");
                fprintf(f, "\"\"\n");
                continue;
            }

            /* Print feature */
            fprintf(f, "\"");
            for (k = 0; k < fe->len; k++) {
                /* isprint() needs an unsigned char value and %x must
                 * not see a sign-extended byte */
                unsigned char ch = (unsigned char) fe->data[k];
                /* NOTE(review): '%' is written unescaped although it
                 * also introduces the %xx escapes -- confirm readers
                 * of this file cope with that */
                if (isprint(ch) || ch == '%')
                    fprintf(f, "%c", ch);
                else
                    fprintf(f, "%%%.2x", ch);
            }
            fprintf(f, "\"\n");
        }
        fvec_destroy(s);
    }

    fclose(f);
}
/** * Allocate and extract a feature vector from a sequence. * There is a global table of delimiter symbols which is only * initialized once the first sequence is processed. * See fvec_reset_delim(); * @param x Sequence of bytes * @param l Length of sequence * @param s Source of features, e.g. file name * @return feature vector */ fvec_t *fvec_extract(char *x, int l, char *s) { fvec_t *fv; int nlen; const char *dlm_str, *cfg_str; assert(x && l >= 0); /* Allocate feature vector */ fv = calloc(1, sizeof(fvec_t)); if (!fv) { error("Could not extract feature vector"); return NULL; } /* Initialize feature vector */ fv->len = 0; fv->total = 0; fv->dim = (feat_t *) malloc(l * sizeof(feat_t)); fv->val = (float *) malloc(l * sizeof(float)); fv->mem = sizeof(fvec_t); /* Set source */ if (s) { fv->src = strdup(s); fv->mem += strlen(s); } /* Check for empty sequence */ if (l == 0) return fv; if (!fv->dim || !fv->val) { error("Could not allocate feature vector"); fvec_destroy(fv); return NULL; } /* Get n-gram length */ config_lookup_int(&cfg, "features.ngram_len", (int *) &nlen); /* Construct delimiter lookup table */ config_lookup_string(&cfg, "features.ngram_delim", &dlm_str); /* N-grams of bytes */ if (!dlm_str || strlen(dlm_str) == 0) { /* Feature extraction */ extract_ngrams(fv, x, l, nlen); } else { if (delim[0] == DELIM_NOT_INIT) { memset(delim, 0, 256); decode_delim(dlm_str); } /* Feature extraction */ extract_wgrams(fv, x, l, nlen); } fv->total = fv->len; /* Sort extracted features */ qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat); /* Compute embedding and condense */ config_lookup_string(&cfg, "features.vect_embed", &cfg_str); if (!strcasecmp(cfg_str, "cnt")) { fvec_condense(fv, EMBED_CNT); } else if (!strcasecmp(cfg_str, "bin")) { fvec_condense(fv, EMBED_BIN); } else { warning("Unknown embedding '%s', using 'cnt'.", cfg_str); fvec_condense(fv, EMBED_CNT); } /* Compute l2 normalization */ fvec_normalize(fv, NORM_L2); return fv; }