/** * Main processing routine of Sally. This function processes chunks of * strings. It might be suitable for OpenMP support in a later version. */ static void sally_process() { long read, i, j; int chunk; const char *hash_file; /* Check if a hash file is set */ config_lookup_string(&cfg, "features.hash_file", &hash_file); /* Get chunk size */ config_lookup_int(&cfg, "input.chunk_size", &chunk); /* Allocate space */ fvec_t **fvec = malloc(sizeof(fvec_t *) * chunk); string_t *strs = malloc(sizeof(string_t) * chunk); if (!fvec || !strs) fatal("Could not allocate memory for embedding"); info_msg(1, "Processing %d strings in chunks of %d.", entries, chunk); for (i = 0, read = 0; i < entries; i += read) { read = input_read(strs, chunk); if (read <= 0) fatal("Failed to read strings from input '%s'", input); /* Generic preprocessing of input */ input_preproc(strs, read); #ifdef ENABLE_OPENMP #pragma omp parallel for #endif for (j = 0; j < read; j++) { fvec[j] = fvec_extract(strs[j].str, strs[j].len); fvec_set_label(fvec[j], strs[j].label); fvec_set_source(fvec[j], strs[j].src); } if (!output_write(fvec, read)) fatal("Failed to write vectors to output '%s'", output); /* Free memory */ input_free(strs, read); output_free(fvec, read); /* Reset hash if enabled but no hash file is set */ if (fhash_enabled() && !strlen(hash_file) > 0) fhash_reset(); prog_bar(0, entries, i + read); } free(fvec); free(strs); }
/**
 * Print the content of a feature vector. A header line with source,
 * label, length and total is followed by one line per dimension showing
 * the dimension in hexadecimal and its value. If the feature hash table
 * is enabled, the stored feature data is printed between the brackets;
 * non-printable bytes as well as '%' and ' ' are written as %xx escapes.
 * @param f File pointer
 * @param fv feature vector
 */
void fvec_print(FILE * f, fvec_t *fv)
{
    assert(fv);
    unsigned long i;
    int j;

    /* Passing a null pointer for %s is undefined, hence the guard */
    fprintf(f, "Feature vector [src: %s, label: %g, len: %lu, total: %lu]\n",
            fv->src ? fv->src : "(null)", fv->label, fv->len, fv->total);

    for (i = 0; i < fv->len; i++) {
        fprintf(f, " %.16llx:%6.4f [", (long long unsigned int) fv->dim[i],
                fv->val[i]);

        if (fhash_enabled()) {
            fentry_t *fe = fhash_get(fv->dim[i]);

            for (j = 0; fe && j < fe->len; j++) {
                /* The <ctype.h> functions require a value in the range
                 * of unsigned char; a negative plain char is undefined */
                unsigned char c = (unsigned char) fe->data[j];

                if (isprint(c) && !strchr("% ", c))
                    fprintf(f, "%c", c);
                else
                    fprintf(f, "%%%.2x", c);
            }
        }

        fprintf(f, "]\n");
    }
}
/**
 * Extract byte n-grams from a string. The features (n-grams) are
 * represented by hash values. Every window of nlen bytes is hashed and
 * appended to the feature vector with value 1, or with value -1 or 1
 * depending on the hash if signed embedding is configured. If the
 * feature hash table is enabled, the n-grams are collected in a local
 * cache that is flushed at the end. If memory for the scratch buffer
 * cannot be allocated, no n-grams are extracted; if memory for the cache
 * cannot be allocated, the n-grams are extracted but not cached.
 * @param fv Feature vector
 * @param x Byte sequence
 * @param l Length of sequence
 * @param nlen N-gram length
 * @param pos Positional n-grams
 * @param shift Shift value
 */
static void extract_ngrams(fvec_t *fv, char *x, int l, int nlen, int pos, int shift)
{
    assert(fv && x);
    unsigned int ci = 0;
    int sort, sign, flen, off;
    cfg_int bits;
    char *fstr = NULL;
    fentry_t *cache = NULL;

    /* Get configuration */
    config_lookup_bool(&cfg, "features.ngram_sort", &sort);
    config_lookup_int(&cfg, "features.hash_bits", &bits);
    config_lookup_bool(&cfg, "features.vect_sign", &sign);

    /* Set bits of hash mask */
    feat_t hash_mask = ((long long unsigned) 2 << (bits - 1)) - 1;

    /* Query once, so that allocating, filling and flushing agree */
    const int hashing = fhash_enabled() ? 1 : 0;
    if (hashing)
        cache = calloc(l, sizeof(fentry_t));

    /* One scratch buffer for all n-grams: nlen bytes plus slack for the
     * position suffix. Skipped if no n-gram fits into the sequence. */
    if (nlen >= 0 && nlen <= l)
        fstr = malloc(nlen + sizeof(unsigned long));

    /* Offsets instead of pointers: no pointer past the end of x is formed */
    for (off = 0; fstr && off < l && nlen <= l - off; off++) {
        /* Copy feature string */
        flen = nlen;
        memcpy(fstr, x + off, nlen);

        /* Sorted n-grams code */
        if (sort)
            qsort(fstr, flen, 1, chrcmp);

        /* Positional n-grams code */
        if (pos) {
            int32_t p = ci + shift;
            memcpy(fstr + flen, &p, sizeof(int32_t));
            flen += sizeof(int32_t);
        }

        feat_t h = hash_str(fstr, flen);
        fv->dim[fv->len] = h & hash_mask;

        /* Signed embedding: the sign is derived from the hash value */
        fv->val[fv->len] = (sign && (signed) h > 0) ? -1 : 1;

        /* Cache feature */
        if (cache)
            cache_put(&cache[ci], fv, fstr, flen);

        fv->len++;
        ci++;
    }

    free(fstr);
    fv->total += fv->len;

    if (!hashing)
        return;

    /* Flush cache; nothing has been stored if it could not be allocated */
    cache_flush(cache, cache ? ci : 0);
    free(cache);
}
/**
 * Extracts word n-grams from a string. The features are represented
 * by hash values. The sequence is first copied with every run of
 * delimiter bytes replaced by a single delimiter symbol (leading
 * delimiters are dropped, a trailing one is added). Each n-gram of nlen
 * consecutive words is then hashed and appended to the feature vector.
 * If the feature hash table is enabled, the n-grams are collected in a
 * local cache that is flushed at the end. Extraction stops early if
 * memory for a feature string cannot be allocated.
 * @param fv Feature vector
 * @param x Byte sequence
 * @param l Length of sequence
 * @param nlen N-gram len
 * @param pos Positional n-grams
 * @param shift Shift value
 */
static void extract_wgrams(fvec_t *fv, char *x, int l, int nlen, int pos, int shift)
{
    assert(fv && x && l > 0);
    int sort, sign, flen;
    cfg_int bits;
    unsigned int i, j = 0, ci = 0;
    unsigned int dlm = 0;
    unsigned int fstart, fnext = 0, fnum = 0;
    char *t = malloc(l + 1), *fstr;
    fentry_t *cache = NULL;

    /* Get configuration */
    config_lookup_bool(&cfg, "features.ngram_sort", &sort);
    config_lookup_int(&cfg, "features.hash_bits", &bits);
    config_lookup_bool(&cfg, "features.vect_sign", &sign);

    /* Set bits of hash mask */
    feat_t hash_mask = ((long long unsigned) 2 << (bits - 1)) - 1;

    /* Query once, so that allocating, filling and flushing agree */
    const int hashing = fhash_enabled() ? 1 : 0;
    if (hashing)
        cache = calloc(l, sizeof(fentry_t));

    /* Without the working copy nothing can be extracted */
    if (!t)
        goto clean;

    /* Find first delimiter symbol (range check before the table access) */
    for (dlm = 0; dlm < 256 && !delim[dlm]; dlm++);

    /* Remove redundant delimiters */
    for (i = 0, j = 0; i < (unsigned int) l; i++) {
        if (delim[(unsigned char) x[i]]) {
            if (j == 0 || delim[(unsigned char) t[j - 1]])
                continue;
            t[j++] = (char) dlm;
        } else {
            t[j++] = x[i];
        }
    }

    /* No characters remaining */
    if (j == 0)
        goto clean;

    /* Add trailing delimiter. The byte is compared as unsigned char:
     * a plain char may be negative and would then never equal dlm. */
    if ((unsigned char) t[j - 1] != dlm)
        t[j++] = (char) dlm;

    /* Extract n-grams */
    for (fstart = i = 0; i < j; i++) {
        /* Count delimiters and remember start position */
        if ((unsigned char) t[i] == dlm && ++fnum == 1)
            fnext = i;

        /* Store n-gram */
        if (fnum == (unsigned int) nlen && i - fstart > 0) {
            /* Copy feature string and add slack */
            flen = i - fstart;
            fstr = malloc(flen + sizeof(unsigned long));
            if (!fstr)
                break;
            memcpy(fstr, t + fstart, flen);

            /* Sorted n-grams code */
            if (sort)
                fstr = sort_words(fstr, flen, dlm);

            /* Positional n-grams code */
            if (pos) {
                int32_t p = ci + shift;
                memcpy(fstr + flen, &p, sizeof(int32_t));
                flen += sizeof(int32_t);
            }

            feat_t h = hash_str(fstr, flen);
            fv->dim[fv->len] = h & hash_mask;

            /* Signed embedding: the sign is derived from the hash value */
            fv->val[fv->len] = (sign && (signed) h > 0) ? -1 : 1;

            /* Cache feature and key */
            if (cache)
                cache_put(&cache[ci], fv, fstr, flen);

            /* Continue with the word following the first one of this n-gram */
            fstart = fnext + 1, i = fnext, fnum = 0;
            fv->len++;
            ci++;
            free(fstr);
        }
    }

    /* Save extracted n-grams */
    fv->total += fv->len;

clean:
    if (hashing) {
        /* Nothing has been stored if the cache could not be allocated */
        cache_flush(cache, cache ? ci : 0);
        free(cache);
    }
    free(t);
}