/*
 * A simple load and save test case
 */
int test_load_save()
{
    int i, j, err = 0;
    fvec_t *f, *g;
    gzFile z;

    test_printf("Loading and saving of feature vectors");
    fvec_reset_delim();
    config_set_string(&cfg, "generic.event_delim", " ");
    config_set_int(&cfg, "features.ngram_len", 2);

    /* Create and save feature vectors */
    z = gzopen(TEST_FILE, "wb9");
    if (!z) {
        printf("Could not create file (ignoring)\n");
        return FALSE;
    }

    for (i = 0; tests[i].str; i++) {
        f = fvec_extract(tests[i].str, strlen(tests[i].str), "test");
        fvec_save(f, z);
        fvec_destroy(f);
    }
    gzclose(z);

    /* Load and compare feature vectors */
    z = gzopen(TEST_FILE, "r");

    for (i = 0; tests[i].str; i++) {
        f = fvec_extract(tests[i].str, strlen(tests[i].str), "test");
        g = fvec_load(z);

        /* Check dimensions and values */
        for (j = 0; j < f->len && j < g->len; j++) {
            if (f->dim[j] != g->dim[j]) {
                test_error("(%d) f->dim[%d] != g->dim[%d]", i, j, j);
                break;
            }
            if (fabs(f->val[j] - g->val[j]) > 10e-10) {
                test_error("(%d) f->val[%d] != g->val[%d]", i, j, j);
                break;
            }
        }
        err += (j < f->len || j < g->len);

        fvec_destroy(f);
        fvec_destroy(g);
    }
    gzclose(z);
    unlink(TEST_FILE);

    test_return(err, i);
    return err;
}

/*
 * A simple test for the binary embedding
 */
int test_embed_bin()
{
    int i, j, n, err = 0;
    string_t strs[10];

    input_config("lines");
    char *test_file = getenv("TEST_FILE");
    n = input_open(test_file);
    input_read(strs, n);

    test_printf("Testing binary embedding");
    config_set_string(&cfg, "features.vect_embed", "bin");
    config_set_string(&cfg, "features.vect_norm", "none");

    for (i = 0, err = 0; i < n; i++) {
        fvec_t *fv = fvec_extract(strs[i].str, strs[i].len);
        double sum = 0;

        /* With binary embedding, every value is 1, so the sum of all
           values must equal the number of dimensions */
        for (j = 0; j < fv->len; j++)
            sum += fv->val[j];

        err += fabs(sum - fv->len) > 1e-6;
        fvec_destroy(fv);
    }
    test_return(err, n);

    input_free(strs, n);
    input_close();
    return err;
}

/*
 * A simple static test for the feature vectors
 */
int test_static()
{
    int i, err = 0;
    fvec_t *f;

    test_printf("Extraction of feature vectors");

    for (i = 0; tests[i].str; i++) {
        fvec_reset_delim();
        config_set_string(&cfg, "features.ngram_delim", tests[i].dlm);
        config_set_int(&cfg, "features.ngram_len", tests[i].nlen);

        /* Extract features */
        f = fvec_extract(tests[i].str, strlen(tests[i].str), "test");

        /* Check for correct number of dimensions */
        if (f->len != tests[i].len) {
            test_error("(%d) len %d != %d", i, f->len, tests[i].len);
            err++;
        }
        fvec_destroy(f);
    }

    test_return(err, i);
    return err;
}

/*
 * A simple stress test for the feature table
 */
int test_stress()
{
    int i, j, err = 0;
    fvec_t *f;
    char buf[STR_LENGTH + 1];

    test_printf("Stress test for feature vectors");

    config_set_string(&cfg, "features.ngram_delim", "0");
    ftable_init();

    for (i = 0; i < STRESS_RUNS; i++) {
        config_set_int(&cfg, "features.ngram_len", rand() % 10 + 1);

        /* Create random key and string */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;

        /* Extract features */
        f = fvec_extract(buf, strlen(buf), "test");

        /* Destroy features */
        fvec_destroy(f);
    }
    ftable_destroy();

    test_return(err, STRESS_RUNS);
    return err;
}

/**
 * Test clustering
 */
int test_cluster_single()
{
    int i, j, k, err = 0;

    test_printf("Clustering using prototypes (single)");

    /* Prepare test data */
    farray_t *fa = farray_create("test");
    for (i = 0; i < DATA_LEN; i++) {
        fvec_t *f = fvec_extract(test_data[i], strlen(test_data[i]), NULL);
        farray_add(fa, f, "test");
    }

    /* Get clustering */
    config_set_string(&cfg, "cluster.link_mode", "single");
    cluster_t *c = cluster_linkage(fa, 0);

    /* Check number of clusters */
    err += (c->num != DATA_CLUSTER);

    /* Check position of prototypes */
    for (k = 0; k < DATA_LEN; k += DATA_LEN / DATA_CLUSTER)
        for (j = 0; j < DATA_LEN / DATA_CLUSTER - 1; j++)
            err += c->cluster[k + j] != c->cluster[k + j + 1];

    /* Clean up */
    cluster_destroy(c);
    farray_destroy(fa);

    test_return(err, 1 + DATA_CLUSTER * (DATA_LEN / DATA_CLUSTER - 1));
    return err;
}

/*
 * A simple stress test for classification
 */
int test_stress()
{
    int i, j, k, err = 0;
    fvec_t *f;
    farray_t *fa;
    char buf[STR_LENGTH + 1], label[32];

    test_printf("Stress test for classification");

    for (i = 0; i < STRESS_RUNS; i++) {
        /* Create array */
        fa = farray_create("test");

        for (j = 0; j < NUM_VECTORS; j++) {
            for (k = 0; k < STR_LENGTH; k++)
                buf[k] = rand() % 10 + '0';
            buf[k] = 0;

            /* Extract features */
            f = fvec_extract(buf, strlen(buf), "test");
            snprintf(label, 32, "label%.2d", rand() % 10);

            /* Add to array */
            farray_add(fa, f, label);
        }

        assign_t *a = class_assign(fa, fa);
        assign_destroy(a);
        farray_destroy(fa);
    }

    test_return(err, STRESS_RUNS);
    return err;
}

/*
 * A simple test for the L2 norm
 */
int test_norm_l2()
{
    int i, j, n, err = 0;
    string_t strs[10];

    input_config("lines");
    char *test_file = getenv("TEST_FILE");
    n = input_open(test_file);
    input_read(strs, n);

    test_printf("Testing L2 normalization");
    config_set_string(&cfg, "features.vect_norm", "l2");

    for (i = 0, err = 0; i < n; i++) {
        fvec_t *fv = fvec_extract(strs[i].str, strs[i].len);
        double sum = 0;

        /* An L2-normalized vector must have Euclidean length 1 */
        for (j = 0; j < fv->len; j++)
            sum += fv->val[j] * fv->val[j];

        err += fabs(sqrt(sum) - 1.0) > 1e-6;
        fvec_destroy(fv);
    }
    test_return(err, n);

    input_free(strs, n);
    input_close();
    return err;
}

/**
 * Main processing routine of Sally. This function processes chunks of
 * strings. The embedding of each chunk is parallelized via OpenMP if
 * support has been enabled at compile time.
 */
static void sally_process()
{
    long read, i, j;
    int chunk;
    const char *hash_file;

    /* Check if a hash file is set */
    config_lookup_string(&cfg, "features.hash_file", &hash_file);

    /* Get chunk size */
    config_lookup_int(&cfg, "input.chunk_size", &chunk);

    /* Allocate space */
    fvec_t **fvec = malloc(sizeof(fvec_t *) * chunk);
    string_t *strs = malloc(sizeof(string_t) * chunk);
    if (!fvec || !strs)
        fatal("Could not allocate memory for embedding");

    info_msg(1, "Processing %d strings in chunks of %d.", entries, chunk);

    for (i = 0, read = 0; i < entries; i += read) {
        read = input_read(strs, chunk);
        if (read <= 0)
            fatal("Failed to read strings from input '%s'", input);

        /* Generic preprocessing of input */
        input_preproc(strs, read);

#ifdef ENABLE_OPENMP
#pragma omp parallel for
#endif
        for (j = 0; j < read; j++) {
            fvec[j] = fvec_extract(strs[j].str, strs[j].len);
            fvec_set_label(fvec[j], strs[j].label);
            fvec_set_source(fvec[j], strs[j].src);
        }

        if (!output_write(fvec, read))
            fatal("Failed to write vectors to output '%s'", output);

        /* Free memory */
        input_free(strs, read);
        output_free(fvec, read);

        /* Reset hash if enabled but no hash file is set */
        if (fhash_enabled() && strlen(hash_file) == 0)
            fhash_reset();

        prog_bar(0, entries, i + read);
    }

    free(fvec);
    free(strs);
}

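/*
 * Illustrative sketch only (not part of Sally): a stripped-down version of
 * the chunked processing loop above, without the output, hashing and
 * progress-bar machinery. It assumes the "lines" input module and the
 * two-argument fvec_extract() interface used by the tests in this file,
 * and that input_open() returns the number of available strings.
 */
static void example_process(const char *file)
{
    string_t strs[32];
    int num, read, j;

    input_config("lines");
    num = input_open(file);

    while (num > 0) {
        /* Read at most one buffer full of strings */
        read = input_read(strs, 32);
        if (read <= 0)
            break;

        for (j = 0; j < read; j++) {
            fvec_t *fv = fvec_extract(strs[j].str, strs[j].len);
            /* ... use the feature vector here ... */
            fvec_destroy(fv);
        }

        input_free(strs, read);
        num -= read;
    }
    input_close();
}
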
/*
 * A stress test for the addition of feature vectors
 */
int test_stress_add()
{
    int i, j, err = 0;
    fvec_t *fx, *fy, *fz;
    char buf[STR_LENGTH + 1];

    test_printf("Stress test for addition of feature vectors");

    /* Create initial vector */
    fz = fvec_extract("aa0bb0cc", 8, "zero");

    for (i = 0; i < NUM_VECTORS; i++) {
        /* Create random key and string */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;

        /* Extract features */
        fx = fvec_extract(buf, strlen(buf), "test");

        /* Add fx to fz */
        fy = fvec_add(fz, fx);
        fvec_destroy(fz);

        err += fabs(fvec_norm2(fy) - 1.4142135623) > 1e-7;

        /* Subtract fx from fz */
        fz = fvec_sub(fy, fx);
        fvec_sparsify(fz);

        /* Clean up */
        fvec_destroy(fy);
        fvec_destroy(fx);
    }
    fvec_destroy(fz);

    test_return(err, i);
    return err;
}

/**
 * Simple test cases for classification
 */
int test_classify()
{
    int i, k, err = 0;
    fvec_t *f;

    test_printf("Classification using prototypes");

    /* Prepare training data */
    farray_t *fa1 = farray_create("train");
    for (i = 0; train_data[i].str; i++) {
        f = fvec_extract(train_data[i].str, strlen(train_data[i].str), NULL);
        farray_add(fa1, f, train_data[i].label);
    }

    /* Prepare testing data */
    farray_t *fa2 = farray_create("test");
    for (i = 0; test_data[i].str; i++) {
        f = fvec_extract(test_data[i].str, strlen(test_data[i].str), NULL);
        farray_add(fa2, f, test_data[i].label);
    }

    /* Classification of test data */
    config_set_float(&cfg, "classify.max_dist", 1.41);
    assign_t *a = class_assign(fa2, fa1);

    /* Check predicted labels */
    for (k = 0; test_data[k].str; k++) {
        char *l = farray_get_label(fa1, a->proto[k]);
        err += strcmp(l, test_data[k].label) != 0;
    }

    /* Clean up */
    assign_destroy(a);
    farray_destroy(fa1);
    farray_destroy(fa2);

    test_return(err, i);
    return err;
}

/*
 * A simple static test for the dot-product of feature vectors
 */
int test_static_dot()
{
    int i, err = 0;
    fvec_t *fx, *fy;

    test_printf("Dot product of feature vectors");

    for (i = 0; test_dot[i].x; i++) {
        /* Extract features */
        fx = fvec_extract(test_dot[i].x, strlen(test_dot[i].x), "test");
        fy = fvec_extract(test_dot[i].y, strlen(test_dot[i].y), "test");

        /* Compute dot product */
        double d = fvec_dot(fx, fy);
        err += fabs(d - test_dot[i].res) > 1e-6;

        fvec_destroy(fx);
        fvec_destroy(fy);
    }

    test_return(err, i);
    return err;
}

/*
 * A stress test for the dot product of feature vectors
 */
int test_stress_dot()
{
    int i, j, err = 0;
    fvec_t *fx, *fy;
    char buf[STR_LENGTH + 1];

    test_printf("Stress test for dot product of feature vectors");

    for (i = 0; i < NUM_VECTORS; i++) {
        /* Create random key and string */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;
        fx = fvec_extract(buf, strlen(buf), "test");

        /* Create random key and string */
        for (j = 0; j < STR_LENGTH; j++)
            buf[j] = rand() % 10 + '0';
        buf[j] = 0;
        fy = fvec_extract(buf, strlen(buf), "test");

        double nx = fvec_dot(fx, fx);
        double ny = fvec_dot(fy, fy);

        err += fabs(fvec_norm2(fx) - sqrt(nx)) > 1e-7;
        err += fabs(fvec_norm2(fy) - sqrt(ny)) > 1e-7;
        err += fabs(fvec_dot(fx, fy)) > nx + ny;

        /* Clean up */
        fvec_destroy(fx);
        fvec_destroy(fy);
    }

    test_return(err, 3 * i);
    return err;
}

/*
 * A simple static test for the addition of feature vectors
 */
int test_static_add()
{
    int i, err = 0;
    fvec_t *fx, *fy, *fz;

    test_printf("Addition of feature vectors");

    for (i = 0; test_add[i].x; i++) {
        /* Extract features */
        fx = fvec_extract(test_add[i].x, strlen(test_add[i].x), "test");
        fy = fvec_extract(test_add[i].y, strlen(test_add[i].y), "test");

        /* Add test vectors */
        fz = fvec_add(fx, fy);
        err += fabs(fvec_norm1(fz) - test_add[i].res) > 1e-7;

        fvec_destroy(fz);
        fvec_destroy(fx);
        fvec_destroy(fy);
    }

    test_return(err, i);
    return err;
}

int test_pos_ngrams()
{
    int i, err = 0;
    fvec_t *f;

    /* Test cases for positional n-grams */
    test_t t[] = {
        {"b b b b b", 3, 0, 1},
        {"b b b b b", 3, 1, 3},
        {"b b b b b", 2, 0, 1},
        {"b b b b b", 2, 1, 4},
        {NULL, 0, 0, 0}
    };

    test_printf("Testing positional n-grams");

    /* Hack to set delimiters */
    config_set_string(&cfg, "features.granularity", "tokens");
    config_set_string(&cfg, "features.token_delim", " ");
    fvec_delim_set(" ");

    for (i = 0; t[i].str; i++) {
        config_set_int(&cfg, "features.ngram_len", t[i].nlen);
        config_set_bool(&cfg, "features.ngram_pos", t[i].flag);
        config_set_int(&cfg, "features.pos_shift", 0);

        /* Extract features */
        f = fvec_extract(t[i].str, strlen(t[i].str));

        /* Check for correct number of dimensions */
        if (f->len != t[i].len) {
            test_error("(%d) len %d != %d", i, f->len, t[i].len);
            err++;
        }
        fvec_destroy(f);
    }

    config_set_bool(&cfg, "features.ngram_pos", 0);
    test_return(err, i);
    return err;
}

/*
 * A simple stress test for clustering
 */
int test_stress()
{
    int i, j, k, err = 0;
    fvec_t *f;
    farray_t *fa;
    char buf[STR_LENGTH + 1], label[32];

    test_printf("Stress test for clustering");

    for (i = 0; i < STRESS_RUNS; i++) {
        /* Create array */
        fa = farray_create("test");

        for (j = 0; j < NUM_VECTORS; j++) {
            for (k = 0; k < STR_LENGTH; k++)
                buf[k] = rand() % 10 + '0';
            buf[k] = 0;

            /* Extract features */
            f = fvec_extract(buf, strlen(buf), "test");
            snprintf(label, 32, "label%.2d", rand() % 10);

            /* Add to array */
            farray_add(fa, f, label);
        }

        /* Compute clustering */
        cluster_t *c = cluster_linkage(fa, 0);

        /* Clean up */
        cluster_destroy(c);
        farray_destroy(fa);
    }

    test_return(err, STRESS_RUNS);
    return err;
}

/**
 * Allocate an empty feature vector
 * @return feature vector
 */
fvec_t *fvec_zero()
{
    return fvec_extract("", 0, "zero");
}

/**
 * Allocates and extracts an empty feature vector
 * @return feature vector
 */
fvec_t *fvec_zero()
{
    return fvec_extract("", 0);
}

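/*
 * Illustrative sketch only (not part of the library): it accumulates several
 * strings into one feature vector, starting from the empty vector returned
 * by fvec_zero(). Only functions used elsewhere in this file are assumed;
 * note that fvec_add() returns a newly allocated vector, so the previous
 * sum and the temporary input vector must be destroyed explicitly.
 */
static fvec_t *example_sum(const char **strs, int num)
{
    fvec_t *sum = fvec_zero();
    int i;

    for (i = 0; i < num; i++) {
        fvec_t *fv = fvec_extract(strs[i], strlen(strs[i]));
        fvec_t *tmp = fvec_add(sum, fv);

        /* Release the old sum and the extracted vector */
        fvec_destroy(sum);
        fvec_destroy(fv);
        sum = tmp;
    }
    return sum;
}
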