Ejemplo n.º 1
0
Archivo: fvec.c Proyecto: MLDroid/sally
/**
 * Internal: Allocates and extracts a feature vector from a string without
 * postprocessing and no blended n-grams.
 *
 * The settings "features.ngram_delim", "features.ngram_pos" and
 * "features.pos_shift" are read from the global configuration. If a lookup
 * fails, the function falls back to: no delimiters (byte n-grams), positional
 * features disabled and a shift of 0.
 *
 * @param x String of bytes (with space delimiters)
 * @param l Length of sequence
 * @param n N-gram length
 * @return feature vector; an empty (zero-initialized) vector if l is 0;
 *         NULL if memory could not be allocated
 */
fvec_t *fvec_extract_intern2(char *x, int l, int n)
{
    fvec_t *fv;
    /*
     * Pre-set defaults: libconfig documents that config_lookup_*() leaves
     * the output argument untouched when the setting is missing or has the
     * wrong type, so these values survive a failed lookup instead of the
     * variables being read uninitialized.
     */
    int pos = 0;
    cfg_int shift = 0;
    const char *dlm_str = NULL;
    assert(x && l >= 0);

    /* Allocate feature vector */
    fv = calloc(1, sizeof(fvec_t));
    if (!fv) {
        error("Could not extract feature vector");
        return NULL;
    }

    /* Get configuration (looked up once; nothing below modifies it) */
    config_lookup_string(&cfg, "features.ngram_delim", &dlm_str);
    config_lookup_bool(&cfg, "features.ngram_pos", &pos);
    config_lookup_int(&cfg, "features.pos_shift", &shift);

    /* Check for empty sequence */
    if (l == 0)
        return fv;

    /*
     * Sanitize shift value. Shifts only apply to positional features, and
     * a negative shift would make the slot count below non-positive (the
     * allocation size would wrap around) and the shift loop empty.
     */
    if (!pos || shift < 0)
        shift = 0;

    /* Allocate arrays: one slot per input element and per position shift */
    size_t slots = (size_t) l * (size_t) (2 * shift + 1);
    fv->dim = malloc(slots * sizeof(feat_t));
    fv->val = malloc(slots * sizeof(float));

    if (!fv->dim || !fv->val) {
        error("Could not allocate feature vector contents");
        fvec_destroy(fv);
        return NULL;
    }

    /* Delimiters select word n-grams; without them byte n-grams are used */
    int use_words = dlm_str && strlen(dlm_str) > 0;

    /* Loop over position shifts (0 if pos is disabled) */
    for (int s = -shift; s <= shift; s++) {
        if (use_words) {
            extract_wgrams(fv, x, l, n, pos, s);
        } else {
            extract_ngrams(fv, x, l, n, pos, s);
        }
    }

    /* Sort extracted features */
    qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat);

    /* Count features  */
    count_feat(fv);

    return fv;
}
Ejemplo n.º 2
0
/**
 * Allocate and extract a feature vector from a sequence.
 * There is a global table of delimiter symbols which is only
 * initialized once the first sequence is processed.
 * See fvec_reset_delim();
 *
 * The settings "features.ngram_len", "features.ngram_delim" and
 * "features.vect_embed" are read from the global configuration. After
 * extraction the features are sorted, condensed according to the embedding
 * ("cnt" or "bin"; anything else triggers a warning and uses "cnt") and
 * L2-normalized.
 *
 * @param x Sequence of bytes
 * @param l Length of sequence
 * @param s Source of features, e.g. file name (may be NULL)
 * @return feature vector; an unprocessed empty vector if l is 0;
 *         NULL if memory could not be allocated or the n-gram length
 *         could not be read from the configuration
 */
fvec_t *fvec_extract(char *x, int l, char *s)
{
    fvec_t *fv;
    int nlen = 0;
    /*
     * Pre-set defaults: libconfig documents that config_lookup_*() leaves
     * the output argument untouched when a lookup fails. A NULL delimiter
     * string selects byte n-grams; an empty embedding name falls through
     * to the "unknown embedding" warning below instead of handing an
     * uninitialized pointer to strcasecmp().
     */
    const char *dlm_str = NULL;
    const char *cfg_str = "";
    assert(x && l >= 0);

    /* Allocate feature vector */
    fv = calloc(1, sizeof(fvec_t));
    if (!fv) {
        error("Could not extract feature vector");
        return NULL;
    }

    /* Initialize feature vector */
    fv->len = 0;
    fv->total = 0;
    fv->dim = malloc(l * sizeof(feat_t));
    fv->val = malloc(l * sizeof(float));
    fv->mem = sizeof(fvec_t);

    /* Set source */
    if (s) {
        fv->src = strdup(s);
        if (!fv->src) {
            error("Could not allocate feature vector");
            fvec_destroy(fv);
            return NULL;
        }
        /* Only account for the copy once it actually exists */
        fv->mem += strlen(s);
    }

    /*
     * Check for empty sequence. This comes before the array check because
     * malloc(0) is allowed to return NULL without that being an error.
     */
    if (l == 0)
        return fv;

    if (!fv->dim || !fv->val) {
        error("Could not allocate feature vector");
        fvec_destroy(fv);
        return NULL;
    }

    /* Get n-gram length; there is no meaningful fallback value */
    if (!config_lookup_int(&cfg, "features.ngram_len", &nlen)) {
        error("Could not read n-gram length from configuration");
        fvec_destroy(fv);
        return NULL;
    }

    /* Construct delimiter lookup table */
    config_lookup_string(&cfg, "features.ngram_delim", &dlm_str);

    /* N-grams of bytes */
    if (!dlm_str || strlen(dlm_str) == 0) {
        /* Feature extraction */
        extract_ngrams(fv, x, l, nlen);
    } else {
        /* Decode the delimiter string only once into the global table */
        if (delim[0] == DELIM_NOT_INIT) {
            memset(delim, 0, 256);
            decode_delim(dlm_str);
        }

        /* Feature extraction */
        extract_wgrams(fv, x, l, nlen);
    }
    /* Remember the number of extracted features before condensing */
    fv->total = fv->len;

    /* Sort extracted features */
    qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat);

    /* Compute embedding and condense */
    config_lookup_string(&cfg, "features.vect_embed", &cfg_str);
    if (!strcasecmp(cfg_str, "cnt")) {
        fvec_condense(fv, EMBED_CNT);
    } else if (!strcasecmp(cfg_str, "bin")) {
        fvec_condense(fv, EMBED_BIN);
    } else {
        warning("Unknown embedding '%s', using 'cnt'.", cfg_str);
        fvec_condense(fv, EMBED_CNT);
    }

    /* Compute l2 normalization */
    fvec_normalize(fv, NORM_L2);
    return fv;
}