/**
 * Build the merged vocabulary of a language model set and the table that
 * maps each merged word ID to the corresponding word ID in every submodel.
 *
 * @param base  The set, viewed through its base model; it is (re)initialized
 *              here with ngram_model_init().
 * @param lmath Log-math object handed on to ngram_model_init().
 * @param n     N-gram order handed on to ngram_model_init().
 *
 * The word strings stored in base->word_str are borrowed from the submodels
 * (or are the "<UNK>" literal), which is why base->writable is cleared.
 */
static void
build_widmap(ngram_model_t * base, logmath_t * lmath, int32 n)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    ngram_model_t **lms = set->lms;
    hash_table_t *merged;
    glist_t entries;
    gnode_t *node;
    int32 m;
    int32 w;
    int32 n_filled;

    /* Union of all submodel vocabularies; duplicate keys are simply ignored. */
    merged = hash_table_new(lms[0]->n_words, FALSE);
    for (m = 0; m < set->n_models; ++m) {
        ngram_model_t *lm = lms[m];

        for (w = 0; w < lm->n_words; ++w) {
            (void) hash_table_enter_int32(merged, lm->word_str[w], w);
        }
    }

    /* Make sure "<UNK>" is part of the merged vocabulary. */
    if (hash_table_lookup(merged, "<UNK>", NULL)) {
        (void) hash_table_enter_int32(merged, "<UNK>", 0);
    }

    /* The unigram count is now known, so the base model can be set up. */
    ngram_model_init(base, &ngram_model_set_funcs, lmath, n,
                     hash_table_inuse(merged));
    base->writable = FALSE;     /* Strings are shared, not owned. */

    /* Copy the key pointers out of the hash table, then sort them. */
    entries = hash_table_tolist(merged, NULL);
    n_filled = 0;
    for (node = entries; node != NULL; node = gnode_next(node)) {
        hash_entry_t *entry = gnode_ptr(node);

        base->word_str[n_filled] = (char *) entry->key;
        ++n_filled;
    }
    glist_free(entries);
    qsort(base->word_str, base->n_words, sizeof(*base->word_str),
          my_compare);

    /* Replace any previous mapping table with a freshly sized one. */
    if (set->widmap) {
        ckd_free_2d((void **) set->widmap);
    }
    set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models,
                                           sizeof(**set->widmap));

    for (w = 0; w < base->n_words; ++w) {
        char *word = base->word_str[w];

        /* Master mapping: word string -> merged word ID. */
        (void) hash_table_enter_int32(base->wid, word, w);

        /* Per-submodel mapping: merged word ID -> submodel word ID. */
        for (m = 0; m < set->n_models; ++m) {
            set->widmap[w][m] = ngram_wid(lms[m], word);
        }
    }

    hash_table_free(merged);
}
/**
 * Read a language model stored in the trie binary format.
 *
 * The file must start with the trie_hdr signature, followed by a one-byte
 * n-gram order and one 32-bit count per order; the trie data and the word
 * strings follow and are read by lm_trie_read_bin() and read_word_str().
 *
 * @param config Not used by this function.
 * @param path   File to read (opened through fopen_comp()).
 * @param lmath  Log-math object handed on to ngram_model_init().
 * @return The newly created model, or NULL if the file cannot be opened,
 *         the signature does not match, or the order/counts are missing or
 *         out of range.
 */
ngram_model_t *
ngram_model_trie_read_bin(cmd_ln_t * config,
                          const char *path, logmath_t * lmath)
{
    int32 is_pipe;
    FILE *fp;
    size_t hdr_size;
    size_t hdr_read;
    char *hdr;
    int cmp_res;
    uint8 i, order;
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;

    E_INFO("Trying to read LM in trie binary format\n");
    if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    /* A short read is treated exactly like a signature mismatch. */
    hdr_size = strlen(trie_hdr);
    hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
    hdr_read = fread(hdr, sizeof(*hdr), hdr_size, fp);
    cmp_res = (hdr_read != hdr_size) || (strcmp(hdr, trie_hdr) != 0);
    ckd_free(hdr);
    if (cmp_res) {
        E_INFO("Header doesn't match\n");
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    /*
     * The order comes straight from the file and is used to index
     * counts[], so it has to be range-checked before anything else.
     * Everything that can fail is read before the model is allocated,
     * which keeps the failure paths down to closing the file.
     */
    if (fread(&order, sizeof(order), 1, fp) != 1
        || order < 1 || order > NGRAM_MAX_ORDER) {
        E_ERROR("Bad or missing n-gram order in %s\n", path);
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    for (i = 0; i < order; i++) {
        if (fread(&counts[i], sizeof(counts[i]), 1, fp) != 1) {
            E_ERROR("Failed to read %d-gram count from %s\n", i + 1, path);
            fclose_comp(fp, is_pipe);
            return NULL;
        }
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    for (i = 0; i < order; i++) {
        base->n_counts[i] = counts[i];
    }

    model->trie = lm_trie_read_bin(counts, order, fp);
    read_word_str(base, fp);
    fclose_comp(fp, is_pipe);

    return base;
}
/**
 * Read a language model stored in the DMP (dump) format and build a trie
 * model from it.
 *
 * The header, the n-gram counts and the whole unigram table are read and
 * validated before the model itself is allocated, so every failure up to
 * that point only has to release temporary buffers and close the file.
 *
 * @param config    Not used by this function.
 * @param file_name File to read (opened through fopen_comp()).
 * @param lmath     Log-math object used to convert the stored log10 weights
 *                  and handed on to ngram_model_init().
 * @return The newly created model, or NULL if the file cannot be opened or
 *         its header, counts or unigram table cannot be read.
 */
ngram_model_t *
ngram_model_trie_read_dmp(cmd_ln_t * config,
                          const char *file_name, logmath_t * lmath)
{
    uint8 do_swap = FALSE;
    int32 is_pipe;
    int32 k;
    uint32 j;
    int32 vn, ts;
    int32 count;
    uint32 counts[3];
    uint32 fixed_counts[3];
    uint32 *unigram_next = NULL;
    dmp_weight_t *unigram_prob = NULL;
    dmp_weight_t *unigram_bo = NULL;
    size_t n_unigram_slots;
    int i, order;
    char str[1024];
    FILE *fp;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;

    E_INFO("Trying to read LM in DMP format\n");
    if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
        E_ERROR("Dump file %s not found\n", file_name);
        return NULL;
    }

    /* The header length doubles as a byte-order probe. */
    if (fread(&k, sizeof(k), 1, fp) != 1) {
        E_ERROR("Cannot read header\n");
        goto error_out;
    }
    if ((size_t) k != strlen(dmp_hdr) + 1) {
        SWAP_INT32(&k);
        if ((size_t) k != strlen(dmp_hdr) + 1) {
            E_ERROR
                ("Wrong magic header size number %x: %s is not a dump file\n",
                 k, file_name);
            goto error_out;
        }
        do_swap = 1;
    }
    if ((size_t) k > sizeof(str) || fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read header\n");
        goto error_out;
    }
    if (strncmp(str, dmp_hdr, k) != 0) {
        E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr,
                file_name);
        goto error_out;
    }

    /* Original LM file name: length-prefixed, must fit into str[]. */
    if (fread(&k, sizeof(k), 1, fp) != 1)
        goto error_out;
    if (do_swap)
        SWAP_INT32(&k);
    if (k < 0 || (size_t) k > sizeof(str)
        || fread(str, 1, k, fp) != (size_t) k) {
        E_ERROR("Cannot read LM filename in header\n");
        goto error_out;
    }

    /* read version#, if present (must be <= 0) */
    if (fread(&vn, sizeof(vn), 1, fp) != 1)
        goto error_out;
    if (do_swap)
        SWAP_INT32(&vn);
    if (vn <= 0) {
        /* read and don't compare timestamps (we don't care) */
        if (fread(&ts, sizeof(ts), 1, fp) != 1)
            goto error_out;
        if (do_swap)
            SWAP_INT32(&ts);

        /* read and skip format description */
        for (;;) {
            if (fread(&k, sizeof(k), 1, fp) != 1)
                goto error_out;
            if (do_swap)
                SWAP_INT32(&k);
            if (k == 0)
                break;
            if (k < 0 || (size_t) k > sizeof(str)
                || fread(str, 1, k, fp) != (size_t) k) {
                E_ERROR("Failed to read word\n");
                goto error_out;
            }
        }

        /* read model->ucount */
        if (fread(&count, sizeof(count), 1, fp) != 1)
            goto error_out;
        if (do_swap)
            SWAP_INT32(&count);
    }
    else {
        /* No version field: the value just read is the unigram count. */
        count = vn;
    }
    if (count < 0) {
        E_ERROR("Bad unigram count %d in dump file %s\n", count, file_name);
        goto error_out;
    }
    counts[0] = count;

    /* read model->bcount, tcount */
    if (fread(&count, sizeof(count), 1, fp) != 1)
        goto error_out;
    if (do_swap)
        SWAP_INT32(&count);
    if (count < 0) {
        E_ERROR("Bad bigram count %d in dump file %s\n", count, file_name);
        goto error_out;
    }
    counts[1] = count;

    if (fread(&count, sizeof(count), 1, fp) != 1)
        goto error_out;
    if (do_swap)
        SWAP_INT32(&count);
    if (count < 0) {
        E_ERROR("Bad trigram count %d in dump file %s\n", count, file_name);
        goto error_out;
    }
    counts[2] = count;
    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);

    if (counts[2] > 0)
        order = 3;
    else if (counts[1] > 0)
        order = 2;
    else
        order = 1;

    /*
     * Read the counts[0] + 1 unigram records into temporaries first, so
     * that a truncated file never leaves a half-built model behind.
     */
    n_unigram_slots = (size_t) counts[0] + 1;
    unigram_next =
        (uint32 *) ckd_calloc(n_unigram_slots, sizeof(*unigram_next));
    unigram_prob =
        (dmp_weight_t *) ckd_calloc(n_unigram_slots, sizeof(*unigram_prob));
    unigram_bo =
        (dmp_weight_t *) ckd_calloc(n_unigram_slots, sizeof(*unigram_bo));
    for (j = 0; j <= counts[0]; j++) {
        int32 mapid;
        int32 bigrams;
        dmp_weight_t prob;
        dmp_weight_t bo;

        /* Record layout: mapping ID (ignored), prob, backoff, next. */
        if (fread(&mapid, sizeof(int32), 1, fp) != 1
            || fread(&prob, sizeof(prob), 1, fp) != 1
            || fread(&bo, sizeof(bo), 1, fp) != 1
            || fread(&bigrams, sizeof(int32), 1, fp) != 1) {
            E_ERROR("Failed to read unigrams\n");
            goto error_out;
        }
        if (do_swap) {
            SWAP_INT32(&prob.l);
            SWAP_INT32(&bo.l);
            SWAP_INT32(&bigrams);
        }
        unigram_prob[j].f = logmath_log10_to_log_float(lmath, prob.f);
        unigram_bo[j].f = logmath_log10_to_log_float(lmath, bo.f);
        /* store pointer to dmp next to recognize wid */
        unigram_next[j] = bigrams;
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);

    model->trie = lm_trie_create(counts[0], order);
    for (j = 0; j <= counts[0]; j++) {
        model->trie->unigrams[j].prob = unigram_prob[j].f;
        model->trie->unigrams[j].bo = unigram_bo[j].f;
        model->trie->unigrams[j].next = unigram_next[j];
    }
    ckd_free(unigram_prob);
    ckd_free(unigram_bo);

    if (order > 1) {
        raw_ngrams =
            ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
                                do_swap);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }

        /* build reversed trie */
        lm_trie_alloc_ngram(model->trie,
                            order > 2 ? fixed_counts : counts, order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        counts[1]++;

        /* free raw ngrams */
        ngrams_raw_free(raw_ngrams, counts, order);
    }
    ckd_free(unigram_next);

    /* read ascii word strings */
    read_word_str(base, fp);

    fclose_comp(fp, is_pipe);
    return base;

  error_out:
    /* Only temporaries exist on this path; the model is not yet allocated. */
    ckd_free(unigram_next);
    ckd_free(unigram_prob);
    ckd_free(unigram_bo);
    fclose_comp(fp, is_pipe);
    return NULL;
}
/**
 * Read a language model stored in ARPA text format and build a trie model
 * from it.
 *
 * @param config Not used by this function.
 * @param path   File to read (opened through fopen_comp()).
 * @param lmath  Log-math object handed on to ngram_model_init().
 * @return The newly created model, or NULL if the file cannot be opened or
 *         the count section / unigram section cannot be read.
 */
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config,
                           const char *path, logmath_t * lmath)
{
    uint32 counts[NGRAM_MAX_ORDER];
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;
    lineiter_t *li;
    FILE *fp;
    int32 is_pipe;
    int order;
    int level;

    E_INFO("Trying to read LM in arpa format\n");
    fp = fopen_comp(path, "r", &is_pipe);
    if (fp == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    li = lineiter_start_clean(fp);

    /* The count section gives both the order and the per-order counts. */
    if (read_counts_arpa(&li, counts, &order) == -1) {
        goto fail;
    }

    E_INFO("LM of order %d\n", order);
    for (level = 0; level < order; level++) {
        E_INFO("#%d-grams: %d\n", level + 1, counts[level]);
    }

    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    base->writable = TRUE;

    model->trie = lm_trie_create(counts[0], order);
    if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0) {
        /*
         * NOTE(review): only the outer struct is released on this path;
         * whatever ngram_model_init() and lm_trie_create() allocated is
         * not freed here -- confirm whether a full teardown is needed.
         */
        goto fail;
    }

    if (order > 1) {
        ngram_raw_t **raw_ngrams;

        /* Higher orders are read raw, their counts fixed, then packed. */
        raw_ngrams = ngrams_raw_read_arpa(&li, base->lmath, counts, order,
                                          base->wid);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (level = 0; level < order; level++) {
            base->n_counts[level] = fixed_counts[level];
        }

        lm_trie_alloc_ngram(model->trie, fixed_counts, order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        ngrams_raw_free(raw_ngrams, counts, order);
    }

    lineiter_free(li);
    fclose_comp(fp, is_pipe);
    return base;

  fail:
    ckd_free(model);
    lineiter_free(li);
    fclose_comp(fp, is_pipe);
    return NULL;
}