/** * Internal: Allocates and extracts a feature vector from a string without * postprocessing and no blended n-grams. * @param x String of bytes (with space delimiters) * @param l Length of sequence * @param n N-gram length * @return feature vector */ fvec_t *fvec_extract_intern2(char *x, int l, int n) { fvec_t *fv; int pos; cfg_int shift; const char *dlm_str; assert(x && l >= 0); /* Allocate feature vector */ fv = calloc(1, sizeof(fvec_t)); if (!fv) { error("Could not extract feature vector"); return NULL; } /* Get configuration */ config_lookup_string(&cfg, "features.ngram_delim", &dlm_str); config_lookup_bool(&cfg, "features.ngram_pos", &pos); config_lookup_int(&cfg, "features.pos_shift", &shift); /* Check for empty sequence */ if (l == 0) return fv; /* Sanitize shift value */ if (!pos) shift = 0; /* Allocate arrays */ int space = 2 * shift + 1; fv->dim = (feat_t *) malloc(l * sizeof(feat_t) * space); fv->val = (float *) malloc(l * sizeof(float) * space); if (!fv->dim || !fv->val) { error("Could not allocate feature vector contents"); fvec_destroy(fv); return NULL; } /* Get configuration */ config_lookup_string(&cfg, "features.ngram_delim", &dlm_str); /* Loop over position shifts (0 if pos is disabled) */ for (int s = -shift; s <= shift; s++) { if (!dlm_str || strlen(dlm_str) == 0) { extract_ngrams(fv, x, l, n, pos, s); } else { extract_wgrams(fv, x, l, n, pos, s); } } /* Sort extracted features */ qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat); /* Count features */ count_feat(fv); return fv; }
/** * Allocate and extract a feature vector from a sequence. * There is a global table of delimiter symbols which is only * initialized once the first sequence is processed. * See fvec_reset_delim(); * @param x Sequence of bytes * @param l Length of sequence * @param s Source of features, e.g. file name * @return feature vector */ fvec_t *fvec_extract(char *x, int l, char *s) { fvec_t *fv; int nlen; const char *dlm_str, *cfg_str; assert(x && l >= 0); /* Allocate feature vector */ fv = calloc(1, sizeof(fvec_t)); if (!fv) { error("Could not extract feature vector"); return NULL; } /* Initialize feature vector */ fv->len = 0; fv->total = 0; fv->dim = (feat_t *) malloc(l * sizeof(feat_t)); fv->val = (float *) malloc(l * sizeof(float)); fv->mem = sizeof(fvec_t); /* Set source */ if (s) { fv->src = strdup(s); fv->mem += strlen(s); } /* Check for empty sequence */ if (l == 0) return fv; if (!fv->dim || !fv->val) { error("Could not allocate feature vector"); fvec_destroy(fv); return NULL; } /* Get n-gram length */ config_lookup_int(&cfg, "features.ngram_len", (int *) &nlen); /* Construct delimiter lookup table */ config_lookup_string(&cfg, "features.ngram_delim", &dlm_str); /* N-grams of bytes */ if (!dlm_str || strlen(dlm_str) == 0) { /* Feature extraction */ extract_ngrams(fv, x, l, nlen); } else { if (delim[0] == DELIM_NOT_INIT) { memset(delim, 0, 256); decode_delim(dlm_str); } /* Feature extraction */ extract_wgrams(fv, x, l, nlen); } fv->total = fv->len; /* Sort extracted features */ qsort(fv->dim, fv->len, sizeof(feat_t), cmp_feat); /* Compute embedding and condense */ config_lookup_string(&cfg, "features.vect_embed", &cfg_str); if (!strcasecmp(cfg_str, "cnt")) { fvec_condense(fv, EMBED_CNT); } else if (!strcasecmp(cfg_str, "bin")) { fvec_condense(fv, EMBED_BIN); } else { warning("Unknown embedding '%s', using 'cnt'.", cfg_str); fvec_condense(fv, EMBED_CNT); } /* Compute l2 normalization */ fvec_normalize(fv, NORM_L2); return fv; }