Esempio n. 1
0
/**
 * Print the content of a feature vector to stdout.
 *
 * A summary line (length, total, memory in kb) is always printed, followed
 * by the source string if one is set. If the global verbosity level is 3 or
 * higher, every dimension is listed with its hash and value; when the
 * dimension is found in the feature lookup table, the stored feature bytes
 * are appended in brackets. Non-printable bytes are written as '%' followed
 * by two hex digits.
 *
 * @param fv feature vector (must not be NULL)
 */
void fvec_print(fvec_t *fv)
{
    assert(fv);
    unsigned long i;
    int j;

    printf("feature vector\n  len: %lu, total: %lu, mem: %.2fkb\n",
           fv->len, fv->total, fv->mem / 1e3);

    if (fv->src)
        printf("  src: '%s'\n", fv->src);

    /* Per-dimension listing only at high verbosity */
    if (verbose < 3)
        return;

    for (i = 0; i < fv->len; i++) {
        printf("    0x%.16llx: %6.4f", (long long unsigned int) fv->dim[i],
               fv->val[i]);

        /* Lookup feature; dimensions without an entry get no string */
        fentry_t *fe = ftable_get(fv->dim[i]);
        if (!fe) {
            printf("\n");
            continue;
        }

        /* Print feature string */
        printf(" [");
        for (j = 0; j < fe->len; j++) {
            /*
             * Go through unsigned char: passing a negative value to
             * isprint() is undefined, and a negative value given to %x
             * would be sign-extended to eight hex digits.
             */
            unsigned char ch = (unsigned char) fe->data[j];

            if (isprint(ch) || ch == '%')
                printf("%c", ch);
            else
                printf("%%%.2x", ch);
        }
        printf("]\n");
    }
}
Esempio n. 2
0
/**
 * Print shared n-grams for each cluster
 *
 * Reads the minimum ratio from the configuration setting
 * "cluster.shared_ngrams" and does nothing if it is not positive. Otherwise
 * the output file is opened in append mode, a comment header is written and,
 * for every non-empty cluster, the binarized feature vectors of its members
 * are summed and divided by the member count. Each dimension whose resulting
 * value reaches the minimum ratio is written as one line:
 * <cluster> <ratio> <hash> <ngram>, with non-printable n-gram bytes escaped
 * as '%' followed by two hex digits.
 *
 * A dimension that is missing from the feature lookup table is reported via
 * error() and written with an empty n-gram string.
 *
 * @param c Clustering structure
 * @param fa Array of feature vectors
 * @param file Output file
 */
void export_shared_ngrams(cluster_t *c, farray_t *fa, const char *file)
{
    assert(c && fa && file);
    int i, j, k, b;
    /* Stays 0.0 (export disabled) if the lookup does not write a value */
    double shared = 0.0;
    FILE *f;
    char *name = NULL;

    config_lookup_float(&cfg, "cluster.shared_ngrams", &shared);
    if (shared <= 0.0)
        return;

    if (verbose > 0)
        printf("Exporting shared n-grams with minimum ratio %4.2f.\n",
               shared);

    if (!(f = fopen(file, "a"))) {
        error("Could not create file '%s'.", file);
        return;
    }

    /* Print incremental header */
    fprintf(f, "# ---\n# Shared n-grams for %s\n", fa->src);
    fprintf(f, "# Minimum ratio of shared n-grams: %4.2f (%2.0f%%)\n",
            shared, shared * 100);
    fprintf(f, "# ---\n# <cluster> <ratio> <hash> <ngram>\n");

    /* Compute shared n-grams per cluster */
    for (i = 0; i < c->num; i++) {
        fvec_t *s = fvec_zero();

        /* k counts the members of cluster i seen so far */
        for (j = 0, k = 0; j < c->len; j++) {
            if (c->cluster[j] != i)
                continue;

            /* Clone and binarize */
            fvec_t *x = fvec_clone(fa->x[j]);
            fvec_bin(x);

            /* The first member provides the cluster name */
            if (k == 0)
                name = cluster_get_name(c, j);

            /* Merge n-grams in cluster; the sum replaces the old vector */
            fvec_t *y = fvec_add(s, x);
            fvec_destroy(s);
            fvec_destroy(x);
            s = y;
            k++;
        }

        /* Check for empty cluster */
        if (k == 0) {
            fvec_destroy(s);
            continue;
        }

        fvec_div(s, k);

        /* Output shared n-grams */
        for (j = 0; j < s->len; j++) {
            if (s->val[j] < shared)
                continue;

            fprintf(f, "%s %6.4f %.16llx ", name, s->val[j],
                    (long long unsigned int) s->dim[j]);

            /* Lookup feature */
            fentry_t *fe = ftable_get(s->dim[j]);
            if (!fe) {
                /* Keep the record well-formed instead of dereferencing NULL */
                error("Oops. Feature not in lookup table.");
                fprintf(f, "\"\"\n");
                continue;
            }

            /* Print feature */
            fprintf(f, "\"");
            for (b = 0; b < fe->len; b++) {
                /*
                 * Go through unsigned char: passing a negative value to
                 * isprint() is undefined, and a negative value given to %x
                 * would be sign-extended to eight hex digits.
                 */
                unsigned char ch = (unsigned char) fe->data[b];

                if (isprint(ch) || ch == '%')
                    fprintf(f, "%c", ch);
                else
                    fprintf(f, "%%%.2x", ch);
            }
            fprintf(f, "\"\n");
        }
        fvec_destroy(s);
    }

    /* fclose flushes buffered output, so a failure here means lost data */
    if (fclose(f) != 0)
        error("Could not write file '%s'.", file);
}