Example #1
0
/**
 * Load a trie from an already-open binary stream.
 *
 * The trie is created with lm_trie_init() for counts[0] unigrams.  For
 * models of order greater than one the quantizer is read first via
 * lm_trie_quant_read_bin(); otherwise the quantizer pointer is set to NULL.
 * counts[0] + 1 unigram records are then read, followed (order > 1 only) by
 * the n-gram memory block allocated by lm_trie_alloc_ngram().
 *
 * @param counts per-order n-gram counts; counts[0] is the unigram count.
 * @param order  order of the model.
 * @param fp     stream to read from.
 * @return the newly created trie.
 *
 * NOTE(review): the fread() results are not checked, so a truncated stream
 * goes unnoticed here.
 */
lm_trie_t* lm_trie_read_bin(uint32 *counts, int order, FILE *fp)
{
    const int has_ngrams = (order > 1);
    lm_trie_t *result = lm_trie_init(counts[0]);

    /* The quantizer precedes the unigram table in the stream. */
    if (has_ngrams)
        result->quant = lm_trie_quant_read_bin(fp, order);
    else
        result->quant = NULL;

    fread(result->unigrams, sizeof(*result->unigrams), (counts[0] + 1), fp);

    /* A unigram-only model has nothing further to load. */
    if (!has_ngrams)
        return result;

    lm_trie_alloc_ngram(result, counts, order);
    fread(result->ngram_mem, 1, result->ngram_mem_size, fp);
    return result;
}
Example #2
0
/**
 * Read a language model in DMP (binary dump) format into a trie model.
 *
 * The file is opened with fopen_comp().  The header size field doubles as a
 * byte-order probe: if it does not match strlen(dmp_hdr) + 1 as read, it is
 * byte-swapped and re-checked, and all later 32-bit fields are swapped too.
 * After the header the LM file name, the optional version/timestamp/format
 * description section and the unigram/bigram/trigram counts are read,
 * followed by the unigram records, the raw higher-order n-grams (when
 * order > 1) and finally the word strings.
 *
 * Every length field taken from the file is validated against the size of
 * the local scratch buffer before it is used, every read is checked, and
 * all failure paths close the file.
 *
 * @param config    not used by this function.
 * @param file_name path of the DMP file.
 * @param lmath     log-math object used to convert the stored weights.
 * @return the base of the newly created model, or NULL on failure.
 */
ngram_model_t *
ngram_model_trie_read_dmp(cmd_ln_t * config,
                          const char *file_name, logmath_t * lmath)
{
    uint8 do_swap;
    int32 is_pipe;
    int32 k;
    uint32 j;
    int32 vn, ts;
    int32 count;
    uint32 counts[3];
    uint32 fixed_counts[3];
    uint32 *unigram_next = NULL;
    int i, order;
    char str[1024];
    size_t hdr_size;
    FILE *fp;
    ngram_model_trie_t *model = NULL;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;

    E_INFO("Trying to read LM in DMP format\n");
    if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
        E_ERROR("Dump file %s not found\n", file_name);
        return NULL;
    }

    /* The header size is also used to detect the file's byte order. */
    hdr_size = strlen(dmp_hdr) + 1;
    do_swap = FALSE;
    if (fread(&k, sizeof(k), 1, fp) != 1) {
        E_ERROR("Cannot read header\n");
        goto error;
    }
    if ((size_t) k != hdr_size) {
        SWAP_INT32(&k);
        if ((size_t) k != hdr_size) {
            E_ERROR
                ("Wrong magic header size number %x: %s is not a dump file\n",
                 k, file_name);
            goto error;
        }
        do_swap = TRUE;
    }
    /* k now equals hdr_size; make sure it fits the scratch buffer. */
    if ((size_t) k > sizeof(str)) {
        E_ERROR("Header of %d bytes does not fit the read buffer\n", k);
        goto error;
    }
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read header\n");
        goto error;
    }
    if (strncmp(str, dmp_hdr, k) != 0) {
        /* Terminate what was read so it can be printed safely. */
        str[k - 1] = '\0';
        E_ERROR("Wrong header %s: %s is not a dump file\n", str, file_name);
        goto error;
    }

    /* LM file name stored in the header: read and discard. */
    if (fread(&k, sizeof(k), 1, fp) != 1)
        goto error;
    if (do_swap)
        SWAP_INT32(&k);
    if (k < 0 || (size_t) k > sizeof(str)) {
        E_ERROR("Invalid LM filename length %d in header\n", k);
        goto error;
    }
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read LM filename in header\n");
        goto error;
    }

    /* read version#, if present (must be <= 0) */
    if (fread(&vn, sizeof(vn), 1, fp) != 1)
        goto error;
    if (do_swap)
        SWAP_INT32(&vn);
    if (vn <= 0) {
        /* read and don't compare timestamps (we don't care) */
        if (fread(&ts, sizeof(ts), 1, fp) != 1)
            goto error;
        if (do_swap)
            SWAP_INT32(&ts);

        /* read and skip format description */
        for (;;) {
            if (fread(&k, sizeof(k), 1, fp) != 1)
                goto error;
            if (do_swap)
                SWAP_INT32(&k);
            if (k == 0)
                break;
            if (k < 0 || (size_t) k > sizeof(str)) {
                E_ERROR("Invalid format description length %d\n", k);
                goto error;
            }
            if (fread(str, 1, k, fp) != (size_t) k) {
                E_ERROR("Failed to read word\n");
                goto error;
            }
        }
        /* read model->ucount */
        if (fread(&count, sizeof(count), 1, fp) != 1)
            goto error;
        if (do_swap)
            SWAP_INT32(&count);
        if (count < 0) {
            E_ERROR("Invalid unigram count %d\n", count);
            goto error;
        }
        counts[0] = count;
    }
    else {
        /* No version field: the positive value just read is the unigram count. */
        counts[0] = vn;
    }
    /* read model->bcount, tcount */
    if (fread(&count, sizeof(count), 1, fp) != 1)
        goto error;
    if (do_swap)
        SWAP_INT32(&count);
    if (count < 0) {
        E_ERROR("Invalid bigram count %d\n", count);
        goto error;
    }
    counts[1] = count;
    if (fread(&count, sizeof(count), 1, fp) != 1)
        goto error;
    if (do_swap)
        SWAP_INT32(&count);
    if (count < 0) {
        E_ERROR("Invalid trigram count %d\n", count);
        goto error;
    }
    counts[2] = count;
    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    /* The highest order with a non-zero count is the model order. */
    if (counts[2] > 0)
        order = 3;
    else if (counts[1] > 0)
        order = 2;
    else
        order = 1;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);

    model->trie = lm_trie_create(counts[0], order);

    /* One element per unigram record; counts[0] + 1 records are stored. */
    unigram_next =
        (uint32 *) ckd_calloc((size_t) counts[0] + 1, sizeof(*unigram_next));
    for (j = 0; j <= counts[0]; j++) {
        int32 bigrams;
        dmp_weight_t weight;

        /* Skip over the mapping ID, we don't care about it. */
        if (fread(&bigrams, sizeof(int32), 1, fp) != 1)
            goto truncated_unigrams;
        /* Read the weights from actual unigram structure. */
        if (fread(&weight, sizeof(weight), 1, fp) != 1)
            goto truncated_unigrams;
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].prob = weight.f;
        if (fread(&weight, sizeof(weight), 1, fp) != 1)
            goto truncated_unigrams;
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].bo = weight.f;
        /* store pointer to dmp next to recognize wid */
        if (fread(&bigrams, sizeof(int32), 1, fp) != 1)
            goto truncated_unigrams;
        if (do_swap)
            SWAP_INT32(&bigrams);
        model->trie->unigrams[j].next = bigrams;
        unigram_next[j] = bigrams;
    }

    if (order > 1) {
        raw_ngrams =
            ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
                                do_swap);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }

        /* build reversed trie */
        lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts,
                            order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        counts[1]++;

        /* free raw ngrams */
        ngrams_raw_free(raw_ngrams, counts, order);
    }
    ckd_free(unigram_next);

    /* read ascii word strings */
    read_word_str(base, fp);

    fclose_comp(fp, is_pipe);
    return base;

truncated_unigrams:
    E_ERROR("Failed to read unigram %u of %u from %s\n",
            j, counts[0], file_name);
    /* fall through to the common cleanup */
error:
    /*
     * NOTE(review): only the top-level model struct is released here.
     * Anything ngram_model_init() and lm_trie_create() allocated internally
     * is not freed; switch to the model's real destructor if one is
     * available in this file.
     */
    ckd_free(unigram_next);
    ckd_free(model);
    fclose_comp(fp, is_pipe);
    return NULL;
}
Example #3
0
/**
 * Read a language model in ARPA text format into a trie model.
 *
 * Opens the file with fopen_comp(), reads the n-gram counts and the model
 * order with read_counts_arpa(), initializes the base model, reads the
 * unigrams into a freshly created trie and, for models of order greater
 * than one, reads the raw higher-order n-grams, fixes their counts, builds
 * the trie from them and frees the raw n-grams again.
 *
 * @param config not used by this function.
 * @param path   path of the ARPA file.
 * @param lmath  log-math object handed to ngram_model_init().
 * @return the base of the newly created model, or NULL if the file cannot
 *         be opened or the counts / unigrams cannot be read.
 */
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config,
                           const char *path, logmath_t * lmath)
{
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *trie_model;
    ngram_model_t *result;
    lineiter_t *line;
    FILE *in;
    int32 piped;
    int n_order;
    int level;

    E_INFO("Trying to read LM in arpa format\n");
    in = fopen_comp(path, "r", &piped);
    if (in == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    trie_model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*trie_model));
    line = lineiter_start_clean(in);

    /* Read n-gram counts from file */
    if (read_counts_arpa(&line, counts, &n_order) == -1)
        goto fail;

    E_INFO("LM of order %d\n", n_order);
    level = 0;
    while (level < n_order) {
        E_INFO("#%d-grams: %d\n", level + 1, counts[level]);
        level++;
    }

    result = &trie_model->base;
    ngram_model_init(result, &ngram_model_trie_funcs, lmath, n_order,
                     (int32) counts[0]);
    result->writable = TRUE;

    trie_model->trie = lm_trie_create(counts[0], n_order);
    if (read_1grams_arpa(&line, counts[0], result,
                         trie_model->trie->unigrams) < 0)
        goto fail;

    /* Higher orders go through the raw n-gram representation first. */
    if (n_order >= 2) {
        ngram_raw_t **raw =
            ngrams_raw_read_arpa(&line, result->lmath, counts, n_order,
                                 result->wid);

        ngrams_raw_fix_counts(raw, counts, fixed_counts, n_order);
        for (level = 0; level < n_order; ++level)
            result->n_counts[level] = fixed_counts[level];

        lm_trie_alloc_ngram(trie_model->trie, fixed_counts, n_order);
        lm_trie_build(trie_model->trie, raw, counts, n_order);
        ngrams_raw_free(raw, counts, n_order);
    }

    lineiter_free(line);
    fclose_comp(in, piped);
    return result;

fail:
    /* Shared failure path: drop the model struct, the iterator and the file. */
    ckd_free(trie_model);
    lineiter_free(line);
    fclose_comp(in, piped);
    return NULL;
}