/**
 * Load a trie from an already opened binary stream.
 *
 * @param counts  n-gram counts; counts[0] sizes the unigram table and the
 *                whole array is handed to lm_trie_alloc_ngram().
 * @param order   model order; the quantizer and the n-gram memory are only
 *                read when it is greater than 1.
 * @param fp      stream positioned at the start of the trie data.
 * @return the trie obtained from lm_trie_init().
 *
 * NOTE(review): the fread() results are not inspected, so a truncated
 * stream is not reported to the caller.
 */
lm_trie_t *
lm_trie_read_bin(uint32 *counts, int order, FILE *fp)
{
    const int has_higher_orders = (order > 1);
    lm_trie_t *loaded = lm_trie_init(counts[0]);

    /* The quantizer section is only consumed for models above unigram order. */
    if (has_higher_orders)
        loaded->quant = lm_trie_quant_read_bin(fp, order);
    else
        loaded->quant = NULL;

    /* The unigram table holds counts[0] + 1 records. */
    fread(loaded->unigrams, sizeof(loaded->unigrams[0]), counts[0] + 1, fp);

    if (!has_higher_orders)
        return loaded;

    /* Size the n-gram storage first, then fill it straight from the stream. */
    lm_trie_alloc_ngram(loaded, counts, order);
    fread(loaded->ngram_mem, 1, loaded->ngram_mem_size, fp);

    return loaded;
}
ngram_model_t * ngram_model_trie_read_dmp(cmd_ln_t * config, const char *file_name, logmath_t * lmath) { uint8 do_swap; int32 is_pipe; int32 k; uint32 j; int32 vn, ts; int32 count; uint32 counts[3]; uint32 fixed_counts[3]; uint32 *unigram_next; int i, order; char str[1024]; FILE *fp; ngram_model_trie_t *model; ngram_model_t *base; ngram_raw_t **raw_ngrams; E_INFO("Trying to read LM in DMP format\n"); if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { E_ERROR("Dump file %s not found\n", file_name); return NULL; } do_swap = FALSE; fread(&k, sizeof(k), 1, fp); if (k != strlen(dmp_hdr) + 1) { SWAP_INT32(&k); if (k != strlen(dmp_hdr) + 1) { E_ERROR ("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); return NULL; } do_swap = 1; } if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read header\n"); return NULL; } if (strncmp(str, dmp_hdr, k) != 0) { E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr); return NULL; } if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read LM filename in header\n"); return NULL; } /* read version#, if present (must be <= 0) */ if (fread(&vn, sizeof(vn), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&vn); if (vn <= 0) { /* read and don't compare timestamps (we don't care) */ if (fread(&ts, sizeof(ts), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&ts); /* read and skip format description */ for (;;) { if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (k == 0) break; if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Failed to read word\n"); return NULL; } } /* read model->ucount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[0] = count; } else { counts[0] = vn; } /* read model->bcount, tcount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[1] = count; if 
(fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[2] = count; E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]); model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model)); base = &model->base; if (counts[2] > 0) order = 3; else if (counts[1] > 0) order = 2; else order = 1; ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]); model->trie = lm_trie_create(counts[0], order); unigram_next = (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next)); for (j = 0; j <= (int32) counts[0]; j++) { int32 bigrams; dmp_weight_t weight; /* Skip over the mapping ID, we don't care about it. */ fread(&bigrams, sizeof(int32), 1, fp); /* Read the weights from actual unigram structure. */ fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].prob = weight.f; fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].bo = weight.f; //store pointer to dmp next to recognize wid fread(&bigrams, sizeof(int32), 1, fp); if (do_swap) SWAP_INT32(&bigrams); model->trie->unigrams[j].next = bigrams; unigram_next[j] = bigrams; } if (order > 1) { raw_ngrams = ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next, do_swap); ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order); for (i = 0; i < order; i++) { base->n_counts[i] = fixed_counts[i]; } //build reversed trie lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts, order); lm_trie_build(model->trie, raw_ngrams, counts, order); counts[1]++; //free raw ngrams ngrams_raw_free(raw_ngrams, counts, order); } ckd_free(unigram_next); /* read ascii word strings */ read_word_str(base, fp); fclose_comp(fp, is_pipe); return base; }
/**
 * Load a trie-based n-gram model from an ARPA text file.
 *
 * Reads the n-gram counts, then the unigrams, and for models of order
 * above 1 reads the raw higher-order n-grams and builds the trie from them.
 * The resulting model is marked writable.
 *
 * @param config  not used by this function.
 * @param path    file handed to fopen_comp().
 * @param lmath   log-math object passed to ngram_model_init().
 * @return the base model on success; NULL if the file cannot be opened or
 *         if reading the counts or the unigrams fails.
 */
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config,
                           const char *path, logmath_t * lmath)
{
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_t *result = NULL;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    lineiter_t *li;
    int32 is_pipe;
    int order;
    int n;
    FILE *fp;

    E_INFO("Trying to read LM in arpa format\n");
    fp = fopen_comp(path, "r", &is_pipe);
    if (fp == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    li = lineiter_start_clean(fp);

    /* Read n-gram counts from file */
    if (read_counts_arpa(&li, counts, &order) == -1)
        goto discard_model;

    E_INFO("LM of order %d\n", order);
    n = 0;
    while (n < order) {
        E_INFO("#%d-grams: %d\n", n + 1, counts[n]);
        n++;
    }

    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    base->writable = TRUE;

    model->trie = lm_trie_create(counts[0], order);
    if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0)
        goto discard_model;

    if (order >= 2) {
        ngram_raw_t **raw_ngrams =
            ngrams_raw_read_arpa(&li, base->lmath, counts, order, base->wid);

        /* The fixed counts, not the declared ones, are published on the
         * model and used to size the trie. */
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        n = 0;
        while (n < order) {
            base->n_counts[n] = fixed_counts[n];
            n++;
        }

        lm_trie_alloc_ngram(model->trie, fixed_counts, order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        ngrams_raw_free(raw_ngrams, counts, order);
    }

    result = base;
    goto release_input;

  discard_model:
    /* Failure: drop the model block; result stays NULL. */
    ckd_free(model);

  release_input:
    /* Shared tail: the line iterator and the stream are released on both
     * the success and the failure path. */
    lineiter_free(li);
    fclose_comp(fp, is_pipe);
    return result;
}