Exemple #1
0
/**
 * Main processing routine of Sally. The input is consumed in chunks of
 * "input.chunk_size" strings: each chunk is read, preprocessed, turned
 * into one feature vector per string and written to the output before
 * the next chunk is read. The extraction loop over a chunk is run in
 * parallel when ENABLE_OPENMP is defined.
 *
 * Any failure (bad chunk size, allocation, read or write error) is
 * reported through fatal().
 */
static void sally_process()
{
    long read, i, j;
    int chunk;
    /* Stays empty if the setting is absent, so strlen() below is safe */
    const char *hash_file = "";

    /* Check if a hash file is set */
    config_lookup_string(&cfg, "features.hash_file", &hash_file);

    /* Get chunk size; a non-positive value could never make progress */
    if (!config_lookup_int(&cfg, "input.chunk_size", &chunk) || chunk <= 0)
        fatal("Invalid chunk size in configuration");

    /* Allocate space for one chunk of vectors and strings */
    fvec_t **fvec = malloc(sizeof(*fvec) * (size_t) chunk);
    string_t *strs = malloc(sizeof(*strs) * (size_t) chunk);

    if (!fvec || !strs)
        fatal("Could not allocate memory for embedding");

    info_msg(1, "Processing %d strings in chunks of %d.", entries, chunk);

    for (i = 0, read = 0; i < entries; i += read) {
        read = input_read(strs, chunk);
        if (read <= 0)
            fatal("Failed to read strings from input '%s'", input);

        /* Generic preprocessing of input */
        input_preproc(strs, read);

        /* Each iteration only touches its own slot of fvec and strs */
#ifdef ENABLE_OPENMP
#pragma omp parallel for
#endif
        for (j = 0; j < read; j++) {
            fvec[j] = fvec_extract(strs[j].str, strs[j].len);
            fvec_set_label(fvec[j], strs[j].label);
            fvec_set_source(fvec[j], strs[j].src);
        }

        if (!output_write(fvec, read))
            fatal("Failed to write vectors to output '%s'", output);

        /* Free memory of the current chunk */
        input_free(strs, read);
        output_free(fvec, read);

        /* Reset hash if enabled but no hash file is set */
        if (fhash_enabled() && strlen(hash_file) == 0)
            fhash_reset();

        prog_bar(0, entries, i + read);
    }

    free(fvec);
    free(strs);
}
Exemple #2
0
/**
 * Print the content of a feature vector.
 *
 * A header line with source, label, length and total is printed first,
 * followed by one line per dimension showing the dimension in hex and its
 * value. If the feature hash table is enabled and holds an entry for the
 * dimension, the stored feature bytes are printed inside the brackets;
 * bytes that are not printable, as well as '%' and ' ', are written as
 * %xx escapes.
 *
 * @param f File pointer
 * @param fv feature vector
 */
void fvec_print(FILE * f, fvec_t *fv)
{
    assert(fv);
    unsigned long i;
    int j;

    fprintf(f, "Feature vector [src: %s, label: %g, len: %lu, total: %lu]\n",
            fv->src, fv->label, fv->len, fv->total);

    for (i = 0; i < fv->len; i++) {
        fprintf(f, "   %.16llx:%6.4f [", (long long unsigned int) fv->dim[i],
                fv->val[i]);

        if (fhash_enabled()) {
            fentry_t *fe = fhash_get(fv->dim[i]);
            for (j = 0; fe && j < fe->len; j++) {
                /* ctype functions require a value in unsigned char range */
                unsigned char c = (unsigned char) fe->data[j];

                if (isprint(c) && c != '%' && c != ' ')
                    fputc(c, f);
                else
                    fprintf(f, "%%%.2x", c);
            }
        }

        fprintf(f, "]\n");
    }
}
Exemple #3
0
/**
 * Extract byte n-grams from a string. The features (n-grams) are
 * represented by hash values.
 *
 * Every window of nlen bytes in x yields one feature that is appended to
 * fv->dim / fv->val at index fv->len. Depending on the configuration the
 * bytes of a window are sorted before hashing ("features.ngram_sort"),
 * the hash is masked to "features.hash_bits" bits and the value is signed
 * ("features.vect_sign"). With pos set, the window index plus shift is
 * appended to the bytes before hashing. If the feature hash table is
 * enabled, the extracted features are also cached and flushed at the end.
 *
 * The caller must provide enough room in fv->dim and fv->val; this
 * function does not grow them.
 *
 * @param fv Feature vector
 * @param x Byte sequence
 * @param l Length of sequence
 * @param nlen N-gram length
 * @param pos Positional n-grams
 * @param shift Shift value
 */
static void extract_ngrams(fvec_t *fv, char *x, int l, int nlen, int pos,
                           int shift)
{
    assert(fv && x);

    unsigned int ci = 0;
    int sort, flen, sign;
    cfg_int bits;
    char *fstr, *t;
    fentry_t *cache = NULL;

    /* Get configuration */
    config_lookup_bool(&cfg, "features.ngram_sort", &sort);
    config_lookup_int(&cfg, "features.hash_bits", &bits);
    config_lookup_bool(&cfg, "features.vect_sign", &sign);

    /* Set bits of hash mask (2 << (bits - 1) avoids a shift by 64) */
    feat_t hash_mask = ((long long unsigned) 2 << (bits - 1)) - 1;

    /*
     * One scratch buffer for all n-grams; the slack holds the optional
     * position suffix.
     * NOTE(review): on allocation failure extraction is skipped silently;
     * switch to the project's error reporting if it is available here.
     */
    fstr = malloc(nlen + sizeof(unsigned long));
    if (!fstr)
        return;

    if (fhash_enabled()) {
        cache = calloc(l, sizeof(fentry_t));
        if (!cache && l > 0) {
            free(fstr);
            return;
        }
    }

    for (t = x; t < x + l; t++) {
        /* Check for sequence end: fewer than nlen bytes remaining */
        if (nlen > (x + l) - t)
            break;

        /* Copy feature string */
        flen = nlen;
        memcpy(fstr, t, nlen);

        /* Sorted n-grams code */
        if (sort)
            qsort(fstr, flen, 1, chrcmp);

        /* Positional n-grams code */
        if (pos) {
            int32_t p = ci + shift;
            memcpy(fstr + flen, &p, sizeof(int32_t));
            flen += sizeof(int32_t);
        }

        feat_t h = hash_str(fstr, flen);
        fv->dim[fv->len] = h & hash_mask;
        fv->val[fv->len] = 1;

        /* Signed embedding: sign is derived from the unmasked hash */
        if (sign)
            fv->val[fv->len] *= (signed) h > 0 ? -1 : 1;

        /* Cache feature */
        if (cache)
            cache_put(&cache[ci], fv, fstr, flen);

        fv->len++;
        ci++;
    }
    fv->total += fv->len;
    free(fstr);

    if (!fhash_enabled())
        return;

    /* Flush cache */
    cache_flush(cache, ci);
    free(cache);
}
Exemple #4
0
/**
 * Extracts word n-grams from a string. The features are represented
 * by hash values.
 *
 * The input is first normalized into a private copy: every run of
 * delimiter bytes (as marked in the delim table) collapses into a single
 * occurrence of the first configured delimiter, leading delimiters are
 * dropped and a trailing delimiter is ensured. Each sequence of nlen
 * consecutive words then yields one feature that is appended to
 * fv->dim / fv->val at index fv->len. Sorting, hash masking and signing
 * follow "features.ngram_sort", "features.hash_bits" and
 * "features.vect_sign". If the feature hash table is enabled, the
 * extracted features are also cached and flushed at the end.
 *
 * The caller must provide enough room in fv->dim and fv->val; this
 * function does not grow them.
 *
 * @param fv Feature vector
 * @param x Byte sequence
 * @param l Length of sequence
 * @param nlen N-gram len
 * @param pos Positional n-grams
 * @param shift Shift value
 */
static void extract_wgrams(fvec_t *fv, char *x, int l, int nlen, int pos,
                           int shift)
{
    assert(fv && x && l > 0);
    int sort, sign, flen;
    cfg_int bits;
    unsigned int i, j, ci = 0;
    unsigned int dlm;
    unsigned int fstart, fnext = 0, fnum = 0;
    char *t, *fstr;
    fentry_t *cache = NULL;

    /* Get configuration */
    config_lookup_bool(&cfg, "features.ngram_sort", &sort);
    config_lookup_int(&cfg, "features.hash_bits", &bits);
    config_lookup_bool(&cfg, "features.vect_sign", &sign);

    /* Set bits of hash mask (2 << (bits - 1) avoids a shift by 64) */
    feat_t hash_mask = ((long long unsigned) 2 << (bits - 1)) - 1;

    /*
     * Normalized copy of the input; one extra byte for the trailing
     * delimiter.
     * NOTE(review): on allocation failure extraction is skipped silently;
     * switch to the project's error reporting if it is available here.
     */
    t = malloc(l + 1);
    if (!t)
        return;

    if (fhash_enabled()) {
        cache = calloc(l, sizeof(fentry_t));
        if (!cache) {
            free(t);
            return;
        }
    }

    /* Find first delimiter symbol; dlm == 256 means none is configured */
    for (dlm = 0; dlm < 256 && !delim[dlm]; dlm++);

    /* Remove redundant delimiters */
    for (i = 0, j = 0; i < (unsigned int) l; i++) {
        if (delim[(unsigned char) x[i]]) {
            if (j == 0 || delim[(unsigned char) t[j - 1]])
                continue;
            t[j++] = (char) dlm;
        } else {
            t[j++] = x[i];
        }
    }

    /* No characters remaining */
    if (j == 0)
        goto clean;

    /*
     * Add trailing delimiter. Bytes are compared as unsigned char so that
     * delimiters >= 0x80 match even where plain char is signed.
     */
    if ((unsigned char) t[j - 1] != dlm)
        t[j++] = (char) dlm;

    /* Extract n-grams */
    for (fstart = i = 0; i < j; i++) {
        /* Count delimiters and remember end of the first word */
        if ((unsigned char) t[i] == dlm && ++fnum == 1)
            fnext = i;

        /* Store n-gram */
        if (fnum == nlen && i - fstart > 0) {
            /* Copy feature string and add slack for the position suffix */
            flen = i - fstart;
            fstr = malloc(flen + sizeof(unsigned long));
            if (!fstr)
                break;
            memcpy(fstr, t + fstart, flen);

            /* Sorted n-grams code */
            if (sort)
                fstr = sort_words(fstr, flen, dlm);

            /* Positional n-grams code */
            if (pos) {
                int32_t p = ci + shift;
                memcpy(fstr + flen, &p, sizeof(int32_t));
                flen += sizeof(int32_t);
            }

            feat_t h = hash_str(fstr, flen);
            fv->dim[fv->len] = h & hash_mask;
            fv->val[fv->len] = 1;

            /* Signed embedding: sign is derived from the unmasked hash */
            if (sign)
                fv->val[fv->len] *= (signed) h > 0 ? -1 : 1;

            /* Cache feature and key */
            if (cache)
                cache_put(&cache[ci], fv, fstr, flen);

            /* Restart right after the first word of this n-gram */
            fstart = fnext + 1;
            i = fnext;
            fnum = 0;

            fv->len++;
            ci++;
            free(fstr);
        }
    }

    /* Save extracted n-grams */
    fv->total += fv->len;

  clean:
    if (cache) {
        cache_flush(cache, ci);
        free(cache);
    }
    free(t);
}