예제 #1
0
ngram_model_t *
ngram_model_trie_read_bin(cmd_ln_t * config,
                          const char *path, logmath_t * lmath)
{
    int32 is_pipe;
    FILE *fp;
    size_t hdr_size;
    char *hdr;
    int cmp_res;
    uint8 i, order;
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;

    E_INFO("Trying to read LM in trie binary format\n");
    if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }
    hdr_size = strlen(trie_hdr);
    hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
    fread(hdr, sizeof(*hdr), hdr_size, fp);
    cmp_res = strcmp(hdr, trie_hdr);
    ckd_free(hdr);
    if (cmp_res) {
        E_INFO("Header doesn't match\n");
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    fread(&order, sizeof(order), 1, fp);
    for (i = 0; i < order; i++) {
        fread(&counts[i], sizeof(counts[i]), 1, fp);
    }
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    for (i = 0; i < order; i++) {
        base->n_counts[i] = counts[i];
    }

    model->trie = lm_trie_read_bin(counts, order, fp);
    read_word_str(base, fp);
    fclose_comp(fp, is_pipe);

    return base;
}
예제 #2
0
int
ngram_model_trie_write_bin(ngram_model_t * base, const char *path)
{
    int i;
    int32 is_pipe;
    ngram_model_trie_t *model = (ngram_model_trie_t *) base;
    FILE *fp = fopen_comp(path, "wb", &is_pipe);
    if (!fp) {
        E_ERROR("Unable to open %s to write binary trie LM\n", path);
        return -1;
    }

    fwrite(trie_hdr, sizeof(*trie_hdr), strlen(trie_hdr), fp);
    fwrite(&model->base.n, sizeof(model->base.n), 1, fp);
    for (i = 0; i < model->base.n; i++) {
        fwrite(&model->base.n_counts[i], sizeof(model->base.n_counts[i]),
               1, fp);
    }
    lm_trie_write_bin(model->trie, base->n_counts[0], fp);
    write_word_str(fp, base);
    fclose_comp(fp, is_pipe);
    return 0;
}
예제 #3
0
int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;  /**< Are we currently reading a list of class words? */
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) !=
                    classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Reset everything. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords =
                    glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0;                     /* Success. */

  error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
예제 #4
0
ngram_model_t *
ngram_model_trie_read_dmp(cmd_ln_t * config,
                          const char *file_name, logmath_t * lmath)
{
    uint8 do_swap;
    int32 is_pipe;
    int32 k;
    uint32 j;
    int32 vn, ts;
    int32 count;
    uint32 counts[3];
    uint32 fixed_counts[3];
    uint32 *unigram_next;
    int i, order;
    char str[1024];
    FILE *fp;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;

    E_INFO("Trying to read LM in DMP format\n");
    if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
        E_ERROR("Dump file %s not found\n", file_name);
        return NULL;
    }

    do_swap = FALSE;
    fread(&k, sizeof(k), 1, fp);
    if (k != strlen(dmp_hdr) + 1) {
        SWAP_INT32(&k);
        if (k != strlen(dmp_hdr) + 1) {
            E_ERROR
                ("Wrong magic header size number %x: %s is not a dump file\n",
                 k, file_name);
            return NULL;
        }
        do_swap = 1;
    }
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read header\n");
        return NULL;
    }
    if (strncmp(str, dmp_hdr, k) != 0) {
        E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr);
        return NULL;
    }

    if (fread(&k, sizeof(k), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&k);
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read LM filename in header\n");
        return NULL;
    }

    /* read version#, if present (must be <= 0) */
    if (fread(&vn, sizeof(vn), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&vn);
    if (vn <= 0) {
        /* read and don't compare timestamps (we don't care) */
        if (fread(&ts, sizeof(ts), 1, fp) != 1)
            return NULL;
        if (do_swap)
            SWAP_INT32(&ts);

        /* read and skip format description */
        for (;;) {
            if (fread(&k, sizeof(k), 1, fp) != 1)
                return NULL;
            if (do_swap)
                SWAP_INT32(&k);
            if (k == 0)
                break;
            if (fread(str, 1, k, fp) != (size_t) k) {
                E_ERROR("Failed to read word\n");
                return NULL;
            }
        }
        /* read model->ucount */
        if (fread(&count, sizeof(count), 1, fp) != 1)
            return NULL;
        if (do_swap)
            SWAP_INT32(&count);
        counts[0] = count;
    }
    else {
        counts[0] = vn;
    }
    /* read model->bcount, tcount */
    if (fread(&count, sizeof(count), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&count);
    counts[1] = count;
    if (fread(&count, sizeof(count), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&count);
    counts[2] = count;
    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    if (counts[2] > 0)
        order = 3;
    else if (counts[1] > 0)
        order = 2;
    else
        order = 1;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);

    model->trie = lm_trie_create(counts[0], order);

    unigram_next =
        (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next));
    for (j = 0; j <= (int32) counts[0]; j++) {
        int32 bigrams;
        dmp_weight_t weight;
        /* Skip over the mapping ID, we don't care about it. */
        fread(&bigrams, sizeof(int32), 1, fp);
        /* Read the weights from actual unigram structure. */
        fread(&weight, sizeof(weight), 1, fp);
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].prob = weight.f;
        fread(&weight, sizeof(weight), 1, fp);
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].bo = weight.f;
        //store pointer to dmp next to recognize wid
        fread(&bigrams, sizeof(int32), 1, fp);
        if (do_swap)
            SWAP_INT32(&bigrams);
        model->trie->unigrams[j].next = bigrams;
        unigram_next[j] = bigrams;
    }

    if (order > 1) {
        raw_ngrams =
            ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
                                do_swap);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }

        //build reversed trie
        lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts,
                            order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        counts[1]++;

        //free raw ngrams
        ngrams_raw_free(raw_ngrams, counts, order);
    }
    ckd_free(unigram_next);

    /* read ascii word strings */
    read_word_str(base, fp);

    fclose_comp(fp, is_pipe);
    return base;
}
예제 #5
0
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config,
                           const char *path, logmath_t * lmath)
{
    FILE *fp;
    lineiter_t *li;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;
    int32 is_pipe;
    uint32 counts[NGRAM_MAX_ORDER];
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    int order;
    int i;

    E_INFO("Trying to read LM in arpa format\n");
    if ((fp = fopen_comp(path, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    li = lineiter_start_clean(fp);
    /* Read n-gram counts from file */
    if (read_counts_arpa(&li, counts, &order) == -1) {
        ckd_free(model);
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    E_INFO("LM of order %d\n", order);
    for (i = 0; i < order; i++) {
        E_INFO("#%d-grams: %d\n", i + 1, counts[i]);
    }

    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    base->writable = TRUE;

    model->trie = lm_trie_create(counts[0], order);
    if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0) {
        ckd_free(model);
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    if (order > 1) {
        raw_ngrams =
            ngrams_raw_read_arpa(&li, base->lmath, counts, order,
                                 base->wid);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }
        lm_trie_alloc_ngram(model->trie, fixed_counts, order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        ngrams_raw_free(raw_ngrams, counts, order);
    }

    lineiter_free(li);
    fclose_comp(fp, is_pipe);

    return base;
}
예제 #6
0
int
srch_FLAT_FWD_begin(void *srch)
{
    srch_FLAT_FWD_graph_t *fwg;
    srch_t *s;
    kbcore_t *kbc;
    int32 w, ispipe;
    char str[1024];
    FILE *fp;
    dict_t *dict;

    s = (srch_t *) srch;
    fwg = (srch_FLAT_FWD_graph_t *) s->grh->graph_struct;
    kbc = s->kbc;
    dict = kbcore_dict(kbc);

    assert(fwg);

    ptmr_reset(&(fwg->tm_hmmeval));
    ptmr_reset(&(fwg->tm_hmmtrans));
    ptmr_reset(&(fwg->tm_wdtrans));

    latticehist_reset(fwg->lathist);

    /* If input lattice file containing word candidates to be searched specified; use it */
    if (fwg->word_cand_dir) {
        ctl_outfile(str, fwg->word_cand_dir, fwg->latfile_ext,
                    (s->uttfile ? s->uttfile : s->uttid), s->uttid,
                    cmd_ln_boolean_r(kbcore_config(s->kbc), "-build_outdirs"));
        E_INFO("Reading input lattice: %s\n", str);

        if ((fp = fopen_compchk(str, &ispipe)) == NULL)
            E_ERROR("fopen_compchk(%s) failed; running full search\n",
                    str);
        else {
            if ((fwg->n_word_cand =
                 word_cand_load(fp, fwg->word_cand, dict,
                                s->uttid)) <= 0) {
                E_ERROR("Bad or empty lattice file: %s; ignored\n", str);
                word_cand_free(fwg->word_cand);
                fwg->n_word_cand = 0;
            }
            else
                E_INFO("%d lattice entries read\n", fwg->n_word_cand);

            fclose_comp(fp, ispipe);
        }
    }

    if (fwg->n_word_cand > 0)
        latticehist_n_cand(fwg->lathist) = fwg->n_word_cand;

    /* Enter all pronunciations of startwid (begin silence) */


    fwg->n_frm = -1;
    for (w = dict->startwid; IS_S3WID(w); w = dict->word[w].alt)
        word_enter(fwg, w, 0, BAD_S3LATID,
                   dict->word[dict->silwid].ciphone[dict->
                                                    word[dict->silwid].
                                                    pronlen - 1]);

    fwg->renormalized = 0;
    fwg->n_frm = 0;

#if 0
    E_INFO("After\n");

    dump_all_whmm(fwg, fwg->whmm, fwg->n_frm, fwg->n_state, NULL);
#endif

    return SRCH_SUCCESS;

}