コード例 #1
0
static void
build_widmap(ngram_model_t * base, logmath_t * lmath, int32 n)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    ngram_model_t **models = set->lms;
    hash_table_t *vocab;
    glist_t hlist;
    gnode_t *gn;
    int32 i;

    /* Construct a merged vocabulary and a set of word-ID mappings. */
    vocab = hash_table_new(models[0]->n_words, FALSE);
    /* Create the set of merged words. */
    for (i = 0; i < set->n_models; ++i) {
        int32 j;
        for (j = 0; j < models[i]->n_words; ++j) {
            /* Ignore collisions. */
            (void) hash_table_enter_int32(vocab, models[i]->word_str[j],
                                          j);
        }
    }
    /* Create the array of words, then sort it. */
    if (hash_table_lookup(vocab, "<UNK>", NULL) != 0)
        (void) hash_table_enter_int32(vocab, "<UNK>", 0);
    /* Now we know the number of unigrams, initialize the base model. */
    ngram_model_init(base, &ngram_model_set_funcs, lmath, n,
                     hash_table_inuse(vocab));
    base->writable = FALSE;     /* We will reuse the pointers from the submodels. */
    i = 0;
    hlist = hash_table_tolist(vocab, NULL);
    for (gn = hlist; gn; gn = gnode_next(gn)) {
        hash_entry_t *ent = gnode_ptr(gn);
        base->word_str[i++] = (char *) ent->key;
    }
    glist_free(hlist);
    qsort(base->word_str, base->n_words, sizeof(*base->word_str),
          my_compare);

    /* Now create the word ID mappings. */
    if (set->widmap)
        ckd_free_2d((void **) set->widmap);
    set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models,
                                           sizeof(**set->widmap));
    for (i = 0; i < base->n_words; ++i) {
        int32 j;
        /* Also create the master wid mapping. */
        (void) hash_table_enter_int32(base->wid, base->word_str[i], i);
        /* printf("%s: %d => ", base->word_str[i], i); */
        for (j = 0; j < set->n_models; ++j) {
            set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]);
            /* printf("%d ", set->widmap[i][j]); */
        }
        /* printf("\n"); */
    }
    hash_table_free(vocab);
}
コード例 #2
0
ファイル: ngram_model_trie.c プロジェクト: day279/sphinxbase
ngram_model_t *
ngram_model_trie_read_bin(cmd_ln_t * config,
                          const char *path, logmath_t * lmath)
{
    int32 is_pipe;
    FILE *fp;
    size_t hdr_size;
    char *hdr;
    int cmp_res;
    uint8 i, order;
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;

    E_INFO("Trying to read LM in trie binary format\n");
    if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }
    hdr_size = strlen(trie_hdr);
    hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
    fread(hdr, sizeof(*hdr), hdr_size, fp);
    cmp_res = strcmp(hdr, trie_hdr);
    ckd_free(hdr);
    if (cmp_res) {
        E_INFO("Header doesn't match\n");
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    fread(&order, sizeof(order), 1, fp);
    for (i = 0; i < order; i++) {
        fread(&counts[i], sizeof(counts[i]), 1, fp);
    }
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    for (i = 0; i < order; i++) {
        base->n_counts[i] = counts[i];
    }

    model->trie = lm_trie_read_bin(counts, order, fp);
    read_word_str(base, fp);
    fclose_comp(fp, is_pipe);

    return base;
}
コード例 #3
0
ファイル: ngram_model_trie.c プロジェクト: day279/sphinxbase
ngram_model_t *
ngram_model_trie_read_dmp(cmd_ln_t * config,
                          const char *file_name, logmath_t * lmath)
{
    uint8 do_swap;
    int32 is_pipe;
    int32 k;
    uint32 j;
    int32 vn, ts;
    int32 count;
    uint32 counts[3];
    uint32 fixed_counts[3];
    uint32 *unigram_next;
    int i, order;
    char str[1024];
    FILE *fp;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;

    E_INFO("Trying to read LM in DMP format\n");
    if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
        E_ERROR("Dump file %s not found\n", file_name);
        return NULL;
    }

    do_swap = FALSE;
    fread(&k, sizeof(k), 1, fp);
    if (k != strlen(dmp_hdr) + 1) {
        SWAP_INT32(&k);
        if (k != strlen(dmp_hdr) + 1) {
            E_ERROR
                ("Wrong magic header size number %x: %s is not a dump file\n",
                 k, file_name);
            return NULL;
        }
        do_swap = 1;
    }
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read header\n");
        return NULL;
    }
    if (strncmp(str, dmp_hdr, k) != 0) {
        E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr);
        return NULL;
    }

    if (fread(&k, sizeof(k), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&k);
    if (fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read LM filename in header\n");
        return NULL;
    }

    /* read version#, if present (must be <= 0) */
    if (fread(&vn, sizeof(vn), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&vn);
    if (vn <= 0) {
        /* read and don't compare timestamps (we don't care) */
        if (fread(&ts, sizeof(ts), 1, fp) != 1)
            return NULL;
        if (do_swap)
            SWAP_INT32(&ts);

        /* read and skip format description */
        for (;;) {
            if (fread(&k, sizeof(k), 1, fp) != 1)
                return NULL;
            if (do_swap)
                SWAP_INT32(&k);
            if (k == 0)
                break;
            if (fread(str, 1, k, fp) != (size_t) k) {
                E_ERROR("Failed to read word\n");
                return NULL;
            }
        }
        /* read model->ucount */
        if (fread(&count, sizeof(count), 1, fp) != 1)
            return NULL;
        if (do_swap)
            SWAP_INT32(&count);
        counts[0] = count;
    }
    else {
        counts[0] = vn;
    }
    /* read model->bcount, tcount */
    if (fread(&count, sizeof(count), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&count);
    counts[1] = count;
    if (fread(&count, sizeof(count), 1, fp) != 1)
        return NULL;
    if (do_swap)
        SWAP_INT32(&count);
    counts[2] = count;
    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    if (counts[2] > 0)
        order = 3;
    else if (counts[1] > 0)
        order = 2;
    else
        order = 1;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);

    model->trie = lm_trie_create(counts[0], order);

    unigram_next =
        (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next));
    for (j = 0; j <= (int32) counts[0]; j++) {
        int32 bigrams;
        dmp_weight_t weight;
        /* Skip over the mapping ID, we don't care about it. */
        fread(&bigrams, sizeof(int32), 1, fp);
        /* Read the weights from actual unigram structure. */
        fread(&weight, sizeof(weight), 1, fp);
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].prob = weight.f;
        fread(&weight, sizeof(weight), 1, fp);
        if (do_swap)
            SWAP_INT32(&weight.l);
        weight.f = logmath_log10_to_log_float(lmath, weight.f);
        model->trie->unigrams[j].bo = weight.f;
        //store pointer to dmp next to recognize wid
        fread(&bigrams, sizeof(int32), 1, fp);
        if (do_swap)
            SWAP_INT32(&bigrams);
        model->trie->unigrams[j].next = bigrams;
        unigram_next[j] = bigrams;
    }

    if (order > 1) {
        raw_ngrams =
            ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
                                do_swap);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }

        //build reversed trie
        lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts,
                            order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        counts[1]++;

        //free raw ngrams
        ngrams_raw_free(raw_ngrams, counts, order);
    }
    ckd_free(unigram_next);

    /* read ascii word strings */
    read_word_str(base, fp);

    fclose_comp(fp, is_pipe);
    return base;
}
コード例 #4
0
ファイル: ngram_model_trie.c プロジェクト: day279/sphinxbase
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config,
                           const char *path, logmath_t * lmath)
{
    FILE *fp;
    lineiter_t *li;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;
    int32 is_pipe;
    uint32 counts[NGRAM_MAX_ORDER];
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    int order;
    int i;

    E_INFO("Trying to read LM in arpa format\n");
    if ((fp = fopen_comp(path, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    li = lineiter_start_clean(fp);
    /* Read n-gram counts from file */
    if (read_counts_arpa(&li, counts, &order) == -1) {
        ckd_free(model);
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    E_INFO("LM of order %d\n", order);
    for (i = 0; i < order; i++) {
        E_INFO("#%d-grams: %d\n", i + 1, counts[i]);
    }

    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    base->writable = TRUE;

    model->trie = lm_trie_create(counts[0], order);
    if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0) {
        ckd_free(model);
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    if (order > 1) {
        raw_ngrams =
            ngrams_raw_read_arpa(&li, base->lmath, counts, order,
                                 base->wid);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }
        lm_trie_alloc_ngram(model->trie, fixed_counts, order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        ngrams_raw_free(raw_ngrams, counts, order);
    }

    lineiter_free(li);
    fclose_comp(fp, is_pipe);

    return base;
}