/**
 * Load a trie language model stored in the binary trie format.
 *
 * The file is consumed in this order: the trie_hdr magic string, a
 * one-byte model order, one uint32 n-gram count per order, the trie
 * payload (parsed by lm_trie_read_bin()) and the word strings (parsed by
 * read_word_str()).  No byte swapping is applied to the counts.
 *
 * @param config Not used by this function.
 * @param path   File to read; opened through fopen_comp().
 * @param lmath  Log-math object handed to ngram_model_init().
 * @return Base of a newly allocated model, or NULL if the file cannot be
 *         opened, the magic string does not match, or the order/count
 *         fields are truncated or out of range.
 */
ngram_model_t *
ngram_model_trie_read_bin(cmd_ln_t * config,
                          const char *path, logmath_t * lmath)
{
    int32 is_pipe;
    FILE *fp;
    size_t hdr_size;
    char *hdr;
    int cmp_res;
    uint8 i, order;
    uint32 counts[NGRAM_MAX_ORDER];
    ngram_model_trie_t *model;
    ngram_model_t *base;

    E_INFO("Trying to read LM in trie binary format\n");
    if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    /* Compare the magic string; a short read counts as a mismatch. */
    hdr_size = strlen(trie_hdr);
    hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
    if (fread(hdr, sizeof(*hdr), hdr_size, fp) != hdr_size)
        cmp_res = 1;
    else
        cmp_res = strcmp(hdr, trie_hdr);
    ckd_free(hdr);
    if (cmp_res) {
        E_INFO("Header doesn't match\n");
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    /*
     * The order comes straight from the file and indexes counts[], so it
     * must be validated before it is used as a loop bound.  Order 0 is
     * rejected as well because counts[0] is read unconditionally below.
     */
    if (fread(&order, sizeof(order), 1, fp) != 1) {
        E_ERROR("Failed to read LM order from %s\n", path);
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    if (order < 1 || order > NGRAM_MAX_ORDER) {
        E_ERROR("Invalid LM order %d in %s\n", (int) order, path);
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    for (i = 0; i < order; i++) {
        if (fread(&counts[i], sizeof(counts[i]), 1, fp) != 1) {
            E_ERROR("Failed to read n-gram counts from %s\n", path);
            fclose_comp(fp, is_pipe);
            return NULL;
        }
    }

    /* Header is sane: only now allocate, so the paths above need no model cleanup. */
    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    for (i = 0; i < order; i++) {
        base->n_counts[i] = counts[i];
    }

    model->trie = lm_trie_read_bin(counts, order, fp);
    read_word_str(base, fp);
    fclose_comp(fp, is_pipe);

    return base;
}
/**
 * Write a trie language model in the binary trie format.
 *
 * Emits, in order: the trie_hdr magic string (without its terminating
 * NUL), the model order (base.n), one n-gram count per order, the trie
 * payload via lm_trie_write_bin() and the word strings via
 * write_word_str().
 *
 * @param base Model to write; it is treated as an ngram_model_trie_t.
 * @param path Destination file; opened through fopen_comp().
 * @return 0 on success, -1 if the file cannot be opened or a write fails.
 */
int
ngram_model_trie_write_bin(ngram_model_t * base, const char *path)
{
    int i;
    int32 is_pipe;
    size_t hdr_len;
    ngram_model_trie_t *model = (ngram_model_trie_t *) base;
    FILE *fp = fopen_comp(path, "wb", &is_pipe);

    if (!fp) {
        E_ERROR("Unable to open %s to write binary trie LM\n", path);
        return -1;
    }

    hdr_len = strlen(trie_hdr);
    if (fwrite(trie_hdr, sizeof(*trie_hdr), hdr_len, fp) != hdr_len)
        goto write_error;
    if (fwrite(&model->base.n, sizeof(model->base.n), 1, fp) != 1)
        goto write_error;
    for (i = 0; i < model->base.n; i++) {
        if (fwrite(&model->base.n_counts[i],
                   sizeof(model->base.n_counts[i]), 1, fp) != 1)
            goto write_error;
    }

    lm_trie_write_bin(model->trie, base->n_counts[0], fp);
    write_word_str(fp, base);

    /*
     * The two helpers above do not report a status that is used here, so
     * fall back on the stream's error flag to notice a failed write.
     */
    if (ferror(fp))
        goto write_error;

    fclose_comp(fp, is_pipe);
    return 0;

write_error:
    E_ERROR("Failed to write binary trie LM to %s\n", path);
    fclose_comp(fp, is_pipe);
    return -1;
}
/**
 * Parse a class definition file and register every class it contains.
 *
 * Each line is split into at most two words.  Outside a class,
 * "LMCLASS <name>" opens a class and every other line is ignored.
 * Inside a class, "END <name>" closes it (the name must match the one
 * that opened it) and every other line is taken as "<word> [<prob>]",
 * with the probability defaulting to 1.0 when absent.  Lines on which
 * str2words() returns a value <= 0 are skipped.
 *
 * @param classes   Hash table that receives one classdef_t per class,
 *                  keyed by class name.
 * @param file_name File to read; opened through fopen_comp().
 * @return 0 on success, -1 if the file cannot be opened, an END marker
 *         names a different class, or the class cannot be entered into
 *         the hash table.
 */
int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;  /**< Are we currently reading a list of class words? */
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;
    char line[512];

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (fgets(line, sizeof(line), fp) != NULL) {
        char *wptr[2];
        int n_words;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (!inclass) {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
            continue;
        }

        /* Look for an end of class marker. */
        if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
            classdef_t *classdef;
            gnode_t *word, *weight;
            int32 i;

            if (classname == NULL || 0 != strcmp(wptr[1], classname))
                goto error_out;
            inclass = FALSE;

            /* Construct a class from the list of words collected. */
            classdef = ckd_calloc(1, sizeof(*classdef));
            classwords = glist_reverse(classwords);
            classprobs = glist_reverse(classprobs);
            classdef->n_words = glist_count(classwords);
            classdef->words = ckd_calloc(classdef->n_words,
                                         sizeof(*classdef->words));
            classdef->weights = ckd_calloc(classdef->n_words,
                                           sizeof(*classdef->weights));
            word = classwords;
            weight = classprobs;
            for (i = 0; i < classdef->n_words; ++i) {
                classdef->words[i] = gnode_ptr(word);
                classdef->weights[i] = gnode_float32(weight);
                word = gnode_next(word);
                weight = gnode_next(weight);
            }

            /* Add this class to the hash table. */
            if (hash_table_enter(classes, classname, classdef) != classdef) {
                /*
                 * The word strings are still referenced by classwords and
                 * are released by the cleanup loop at error_out, so only
                 * the containers are freed here.  Releasing the strings
                 * through the classdef as well would free them twice.
                 * NOTE(review): assumes classdef_t owns nothing beyond
                 * the words and weights arrays set up above - confirm.
                 */
                ckd_free(classdef->words);
                ckd_free(classdef->weights);
                ckd_free(classdef);
                goto error_out;
            }

            /*
             * Reset everything.  Only the list nodes are freed: the word
             * strings now belong to classdef, and classname was handed to
             * hash_table_enter() as the key.
             */
            glist_free(classwords);
            glist_free(classprobs);
            classwords = NULL;
            classprobs = NULL;
            classname = NULL;
        }
        else {
            float32 fprob;

            if (n_words == 2)
                fprob = atof_c(wptr[1]);
            else
                fprob = 1.0f;
            /* Add it to the list of words for this class. */
            classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
            classprobs = glist_add_float32(classprobs, fprob);
        }
    }
    rv = 0;                     /* Success. */

error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
ngram_model_t * ngram_model_trie_read_dmp(cmd_ln_t * config, const char *file_name, logmath_t * lmath) { uint8 do_swap; int32 is_pipe; int32 k; uint32 j; int32 vn, ts; int32 count; uint32 counts[3]; uint32 fixed_counts[3]; uint32 *unigram_next; int i, order; char str[1024]; FILE *fp; ngram_model_trie_t *model; ngram_model_t *base; ngram_raw_t **raw_ngrams; E_INFO("Trying to read LM in DMP format\n"); if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { E_ERROR("Dump file %s not found\n", file_name); return NULL; } do_swap = FALSE; fread(&k, sizeof(k), 1, fp); if (k != strlen(dmp_hdr) + 1) { SWAP_INT32(&k); if (k != strlen(dmp_hdr) + 1) { E_ERROR ("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); return NULL; } do_swap = 1; } if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read header\n"); return NULL; } if (strncmp(str, dmp_hdr, k) != 0) { E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr); return NULL; } if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Cannot read LM filename in header\n"); return NULL; } /* read version#, if present (must be <= 0) */ if (fread(&vn, sizeof(vn), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&vn); if (vn <= 0) { /* read and don't compare timestamps (we don't care) */ if (fread(&ts, sizeof(ts), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&ts); /* read and skip format description */ for (;;) { if (fread(&k, sizeof(k), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&k); if (k == 0) break; if (fread(str, 1, k, fp) != (size_t) k) { E_ERROR("Failed to read word\n"); return NULL; } } /* read model->ucount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[0] = count; } else { counts[0] = vn; } /* read model->bcount, tcount */ if (fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[1] = count; if 
(fread(&count, sizeof(count), 1, fp) != 1) return NULL; if (do_swap) SWAP_INT32(&count); counts[2] = count; E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]); model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model)); base = &model->base; if (counts[2] > 0) order = 3; else if (counts[1] > 0) order = 2; else order = 1; ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]); model->trie = lm_trie_create(counts[0], order); unigram_next = (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next)); for (j = 0; j <= (int32) counts[0]; j++) { int32 bigrams; dmp_weight_t weight; /* Skip over the mapping ID, we don't care about it. */ fread(&bigrams, sizeof(int32), 1, fp); /* Read the weights from actual unigram structure. */ fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].prob = weight.f; fread(&weight, sizeof(weight), 1, fp); if (do_swap) SWAP_INT32(&weight.l); weight.f = logmath_log10_to_log_float(lmath, weight.f); model->trie->unigrams[j].bo = weight.f; //store pointer to dmp next to recognize wid fread(&bigrams, sizeof(int32), 1, fp); if (do_swap) SWAP_INT32(&bigrams); model->trie->unigrams[j].next = bigrams; unigram_next[j] = bigrams; } if (order > 1) { raw_ngrams = ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next, do_swap); ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order); for (i = 0; i < order; i++) { base->n_counts[i] = fixed_counts[i]; } //build reversed trie lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts, order); lm_trie_build(model->trie, raw_ngrams, counts, order); counts[1]++; //free raw ngrams ngrams_raw_free(raw_ngrams, counts, order); } ckd_free(unigram_next); /* read ascii word strings */ read_word_str(base, fp); fclose_comp(fp, is_pipe); return base; }
/**
 * Load a trie language model from an ARPA-format text file.
 *
 * Reads the n-gram counts, then the unigrams, and - for models of order
 * greater than one - the higher-order n-grams, from which the trie is
 * built.  The resulting model is marked writable.
 *
 * @param config Not used by this function.
 * @param path   File to read; opened through fopen_comp().
 * @param lmath  Log-math object handed to ngram_model_init().
 * @return Base of a newly allocated model, or NULL if the file cannot be
 *         opened or reading the counts or the unigrams fails.
 */
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config,
                           const char *path, logmath_t * lmath)
{
    FILE *arpa_fp;
    lineiter_t *line_it;
    ngram_model_trie_t *trie_model;
    ngram_model_t *base;
    ngram_model_t *result = NULL;
    ngram_raw_t **raw;
    int32 pipe_flag;
    uint32 counts[NGRAM_MAX_ORDER];
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    int order;
    int n;

    E_INFO("Trying to read LM in arpa format\n");
    arpa_fp = fopen_comp(path, "r", &pipe_flag);
    if (arpa_fp == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    trie_model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*trie_model));
    line_it = lineiter_start_clean(arpa_fp);

    /* Read n-gram counts from file */
    if (read_counts_arpa(&line_it, counts, &order) == -1)
        goto bail;

    E_INFO("LM of order %d\n", order);
    n = 0;
    while (n < order) {
        E_INFO("#%d-grams: %d\n", n + 1, counts[n]);
        n++;
    }

    base = &trie_model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
                     (int32) counts[0]);
    base->writable = TRUE;

    trie_model->trie = lm_trie_create(counts[0], order);
    if (read_1grams_arpa(&line_it, counts[0], base,
                         trie_model->trie->unigrams) < 0)
        goto bail;

    /* Higher orders are read raw first, then packed into the trie. */
    if (order > 1) {
        raw = ngrams_raw_read_arpa(&line_it, base->lmath, counts, order,
                                   base->wid);
        ngrams_raw_fix_counts(raw, counts, fixed_counts, order);
        n = 0;
        while (n < order) {
            base->n_counts[n] = fixed_counts[n];
            n++;
        }
        lm_trie_alloc_ngram(trie_model->trie, fixed_counts, order);
        lm_trie_build(trie_model->trie, raw, counts, order);
        ngrams_raw_free(raw, counts, order);
    }

    result = base;
    goto finish;

bail:
    /* Both failure points release the model struct and return NULL. */
    ckd_free(trie_model);

finish:
    lineiter_free(line_it);
    fclose_comp(arpa_fp, pipe_flag);
    return result;
}
int srch_FLAT_FWD_begin(void *srch) { srch_FLAT_FWD_graph_t *fwg; srch_t *s; kbcore_t *kbc; int32 w, ispipe; char str[1024]; FILE *fp; dict_t *dict; s = (srch_t *) srch; fwg = (srch_FLAT_FWD_graph_t *) s->grh->graph_struct; kbc = s->kbc; dict = kbcore_dict(kbc); assert(fwg); ptmr_reset(&(fwg->tm_hmmeval)); ptmr_reset(&(fwg->tm_hmmtrans)); ptmr_reset(&(fwg->tm_wdtrans)); latticehist_reset(fwg->lathist); /* If input lattice file containing word candidates to be searched specified; use it */ if (fwg->word_cand_dir) { ctl_outfile(str, fwg->word_cand_dir, fwg->latfile_ext, (s->uttfile ? s->uttfile : s->uttid), s->uttid, cmd_ln_boolean_r(kbcore_config(s->kbc), "-build_outdirs")); E_INFO("Reading input lattice: %s\n", str); if ((fp = fopen_compchk(str, &ispipe)) == NULL) E_ERROR("fopen_compchk(%s) failed; running full search\n", str); else { if ((fwg->n_word_cand = word_cand_load(fp, fwg->word_cand, dict, s->uttid)) <= 0) { E_ERROR("Bad or empty lattice file: %s; ignored\n", str); word_cand_free(fwg->word_cand); fwg->n_word_cand = 0; } else E_INFO("%d lattice entries read\n", fwg->n_word_cand); fclose_comp(fp, ispipe); } } if (fwg->n_word_cand > 0) latticehist_n_cand(fwg->lathist) = fwg->n_word_cand; /* Enter all pronunciations of startwid (begin silence) */ fwg->n_frm = -1; for (w = dict->startwid; IS_S3WID(w); w = dict->word[w].alt) word_enter(fwg, w, 0, BAD_S3LATID, dict->word[dict->silwid].ciphone[dict-> word[dict->silwid]. pronlen - 1]); fwg->renormalized = 0; fwg->n_frm = 0; #if 0 E_INFO("After\n"); dump_all_whmm(fwg, fwg->whmm, fwg->n_frm, fwg->n_state, NULL); #endif return SRCH_SUCCESS; }