static int
ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
                      hash_table_t * wid, logmath_t * lmath, uint32 count,
                      int order, int order_max)
{
    char expected_header[20];
    uint32 i;

    sprintf(expected_header, "\\%d-grams:", order);
    while (*li && strcmp((*li)->buf, expected_header) != 0) {
        *li = lineiter_next(*li);
    }
    if (*li == NULL) {
        E_ERROR("Failed to find '%s', language model file truncated\n",
                expected_header);
        return -1;
    }

    *raw_ngrams = (ngram_raw_t *) ckd_calloc(count, sizeof(ngram_raw_t));
    for (i = 0; i < count; i++) {
        if (read_ngram_instance(li, wid, lmath, order, order_max,
                                &((*raw_ngrams)[i])) < 0)
            break;
    }

    qsort(*raw_ngrams, count, sizeof(ngram_raw_t), &ngram_ord_comparator);
    return 0;
}
/*********************************************************************
 *
 * Function: corpus_set_partition
 *
 * Description:
 *	Selects one set of a partition of the corpus into `parts`
 *	(roughly) equal-sized partitions.
 *
 * Function Inputs:
 *	uint32 part -
 *		Selects the part'th of the `parts` sets
 *		(part runs from 1..parts).
 *
 *	uint32 parts -
 *		The total number of (roughly equal-sized) sets in the
 *		partition.
 *
 * Global Inputs:
 *	None
 *
 * Return Values:
 *	S3_SUCCESS - Operation completed successfully
 *	S3_ERROR   - Operation did not complete successfully
 *
 * Global Outputs:
 *	None
 *
 *********************************************************************/
int
corpus_set_partition(uint32 part, uint32 parts)
{
    uint32 run_len;
    uint32 n_skip;
    int lineno = 0;
    lineiter_t *li;

    if (ctl_fp == NULL) {
        E_ERROR("Control file has not been set\n");
        return S3_ERROR;
    }

    /* Count the lines in the control file, then rewind it. */
    for (li = lineiter_start(ctl_fp); li; li = lineiter_next(li)) {
        lineno++;
    }
    fseek(ctl_fp, 0L, SEEK_SET);

    run_len = lineno / parts;
    n_skip = (part - 1) * run_len;
    if (part == parts)
        run_len = UNTIL_EOF;

    return corpus_set_interval(n_skip, run_len);
}
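/* A minimal usage sketch (hypothetical driver code, not from the original
 * source): job 2 of 4 selects the second quarter of the corpus. It is
 * assumed that the control file has already been opened by whatever call
 * initializes ctl_fp. */
static int
select_my_partition(void)
{
    if (corpus_set_partition(2, 4) != S3_SUCCESS) {
        E_ERROR("Failed to select partition 2 of 4\n");
        return S3_ERROR;
    }
    return S3_SUCCESS;
}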
static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
    FILE *fh;
    lineiter_t *litor;
    int32 nccs, noovs, nwords, lscr;
    float64 ch, log_to_log2;

    if ((fh = fopen(lsnfn, "r")) == NULL)
        E_FATAL_SYSTEM("Failed to open transcript file %s", lsnfn);

    /* We have to keep ch in floating-point to avoid overflows, so
     * we might as well use log2. */
    log_to_log2 = log(logmath_get_base(lmath)) / log(2);
    lscr = nccs = noovs = nwords = 0;
    ch = 0.0;
    for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
        char **words;
        int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;

        n = str2words(litor->buf, NULL, 0);
        if (n < 0)
            E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
        if (n == 0) /* Do nothing! */
            continue;
        words = ckd_calloc(n, sizeof(*words));
        str2words(litor->buf, words, n);

        /* Remove any utterance ID (FIXME: has to be a single "word") */
        if (words[n - 1][0] == '('
            && words[n - 1][strlen(words[n - 1]) - 1] == ')')
            n = n - 1;

        tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
                              &tmp_noovs, &tmp_lscr);

        ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
        nccs += tmp_nccs;
        noovs += tmp_noovs;
        lscr += tmp_lscr;
        nwords += n;

        ckd_free(words);
    }

    ch /= (nwords - nccs - noovs);
    printf("cross-entropy: %f bits\n", ch);

    /* Calculate perplexity: pplx = 2^CH, since CH is in bits */
    printf("perplexity: %f\n", pow(2.0, ch));
    printf("lm score: %d\n", lscr);

    /* Report OOVs and CCs */
    printf("%d words evaluated\n", nwords);
    printf("%d OOVs (%.2f%%), %d context cues removed\n",
           noovs, (double)noovs / nwords * 100, nccs);
}
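/*
 * For reference (not from the original source): with N scored words, i.e.
 * excluding OOVs and context cues, evaluate_file reports
 *
 *   CH   = -(1/N) * sum_i log2 P(w_i | w_1 ... w_{i-1})   [bits]
 *   pplx = 2^CH
 *
 * calc_entropy is assumed here to return a per-word entropy in the logmath
 * base, which is why the sum is rebuilt above by multiplying tmp_ch by the
 * scored word count and converting with log_to_log2.
 */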
static int
open_nist_file(sphinx_wave2feat_t *wtf, char const *infile,
               FILE **out_fh, int detect_endian)
{
    char nist[7];
    lineiter_t *li;
    FILE *fh;

    if ((fh = fopen(infile, "rb")) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s", infile);
        return -1;
    }
    if (fread(nist, 1, 7, fh) != 7) {
        E_ERROR_SYSTEM("Failed to read NIST header");
        fclose(fh);
        return -1;
    }
    /* Is this actually a NIST file? */
    if (0 != strncmp(nist, "NIST_1A", 7)) {
        fclose(fh);
        return FALSE;
    }
    /* Rewind, parse lines. */
    fseek(fh, 0, SEEK_SET);
    for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
        char **words;
        int nword;

        string_trim(li->buf, STRING_BOTH);
        /* An empty line terminates the header. */
        if (strlen(li->buf) == 0) {
            lineiter_free(li);
            break;
        }
        nword = str2words(li->buf, NULL, 0);
        if (nword != 3)
            continue;
        words = (char **)ckd_calloc(nword, sizeof(*words));
        str2words(li->buf, words, nword);
        if (0 == strcmp(words[0], "sample_rate")) {
            cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
        }
        if (0 == strcmp(words[0], "channel_count")) {
            cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
        }
        if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
            cmd_ln_set_str_r(wtf->config, "-input_endian",
                             (0 == strcmp(words[2], "10")) ? "big" : "little");
        }
        ckd_free(words);
    }

    /* Skip the fixed-size header to reach the sample data. */
    fseek(fh, 1024, SEEK_SET);
    if (out_fh)
        *out_fh = fh;
    else
        fclose(fh);
    return TRUE;
}
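/*
 * For reference, a NIST_1A (SPHERE) header as parsed above starts like this
 * (field values illustrative): the second line gives the header size, the
 * header occupies the first 1024 bytes of the file (hence the final fseek),
 * and each field line has the three-token "name -type value" shape that the
 * nword == 3 check relies on:
 *
 *   NIST_1A
 *      1024
 *   sample_count -i 143968
 *   sample_rate -i 16000
 *   channel_count -i 1
 *   sample_byte_format -s2 01
 *   sample_sig_bits -i 16
 *   end_head
 */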
/*
 * Read and return the #unigrams, #bigrams, #trigrams, ... stated in the
 * input file.
 */
static int
read_counts_arpa(lineiter_t ** li, uint32 * counts, int *order)
{
    int32 ngram, prev_ngram;
    uint32 ngram_cnt;

    /* Skip file until past the '\data\' marker */
    while (*li) {
        if (strcmp((*li)->buf, "\\data\\") == 0)
            break;
        *li = lineiter_next(*li);
    }

    if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
        E_INFO("No \\data\\ mark in LM file\n");
        return -1;
    }

    prev_ngram = 0;
    *order = 0;
    while ((*li = lineiter_next(*li))) {
        if (sscanf((*li)->buf, "ngram %d=%u", &ngram, &ngram_cnt) != 2)
            break;
        if (ngram != prev_ngram + 1) {
            E_ERROR("Ngram counts in LM file are out of order: "
                    "%d goes after %d\n", ngram, prev_ngram);
            return -1;
        }
        prev_ngram = ngram;
        counts[*order] = ngram_cnt;
        (*order)++;
    }

    if (*li == NULL) {
        E_ERROR("EOF while reading ngram counts\n");
        return -1;
    }

    return 0;
}
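/*
 * For reference, read_counts_arpa consumes an ARPA header section of the
 * form (counts illustrative):
 *
 *   \data\
 *   ngram 1=64000
 *   ngram 2=4532789
 *   ngram 3=9068762
 *
 * and returns with *li positioned at the first line after the counts
 * (typically a blank line before "\1-grams:").
 */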
lineiter_t *
lineiter_start_clean(FILE *fh)
{
    lineiter_t *li;

    li = lineiter_start(fh);
    if (li == NULL)
        return li;

    li->clean = TRUE;

    if (li->buf && li->buf[0] == '#') {
        li = lineiter_next(li);
    }
    else {
        string_trim(li->buf, STRING_BOTH);
    }

    return li;
}
static void
ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
                      hash_table_t * wid, logmath_t * lmath, uint32 count,
                      int order, int order_max)
{
    char expected_header[20];
    uint32 i;

    sprintf(expected_header, "\\%d-grams:", order);
    while (*li && strcmp((*li)->buf, expected_header) != 0) {
        *li = lineiter_next(*li);
    }
    *raw_ngrams = (ngram_raw_t *) ckd_calloc(count, sizeof(ngram_raw_t));
    for (i = 0; i < count; i++) {
        read_ngram_instance(li, wid, lmath, order, order_max,
                            &((*raw_ngrams)[i]));
    }

    /* Sort the raw n-grams that were just read. */
    ngram_comparator(NULL, &order);     /* set the order used by the comparator */
    qsort(*raw_ngrams, count, sizeof(ngram_raw_t), &ngram_comparator);
}
lineiter_t *
lineiter_start(FILE *fh)
{
    lineiter_t *li;

    li = (lineiter_t *)ckd_calloc(1, sizeof(*li));
    li->buf = (char *)ckd_malloc(128);
    li->buf[0] = '\0';
    li->bsiz = 128;
    li->len = 0;
    li->fh = fh;

    li = lineiter_next(li);

    /* Strip the UTF-8 BOM (copy the rest of the string, NUL included) */
    if (li && 0 == strncmp(li->buf, "\xef\xbb\xbf", 3)) {
        memmove(li->buf, li->buf + 3, strlen(li->buf + 3) + 1);
        li->len -= 3;
    }

    return li;
}
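/* A minimal usage sketch (hypothetical, not from the original source):
 * iterate over all lines of an open file with the lineiter API above.
 * lineiter_next() frees the iterator when it reaches EOF, as the loops
 * elsewhere in this collection rely on, so lineiter_free() is only needed
 * when breaking out of the loop early. */
static void
print_lines(FILE *fh)
{
    lineiter_t *li;

    for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
        /* li->buf holds the current line (newline included in raw mode). */
        printf("%s", li->buf);
    }
}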
int32
model_def_read(model_def_t **out_model_def, const char *file_name)
{
    lineiter_t *li = NULL;
    uint32 n;
    char tag[32];
    acmod_set_t *acmod_set;
    uint32 i, j;
    acmod_id_t acmod_id;
    uint32 tmat;
    uint32 n_state;
    uint32 n_tri;
    uint32 n_base;
    uint32 n_total_map;
    uint32 n_tied_state;
    uint32 n_tied_ci_state;
    uint32 n_tied_tmat;
    uint32 state[MAX_N_STATE];
    uint32 n_total;
    model_def_t *omd;
    model_def_entry_t *mdef;
    uint32 *all_state;
    uint32 max_tmat;
    uint32 max_state;
    uint32 max_ci_state;
    FILE *fp;

    fp = fopen(file_name, "r");
    if (fp == NULL) {
        E_WARN_SYSTEM("Unable to open %s for reading", file_name);
        return S3_ERROR;
    }

    li = lineiter_start_clean(fp);
    if (li == NULL) {
        E_ERROR("Not even a version number in %s!?\n", file_name);
        fclose(fp);
        lineiter_free(li);
        return S3_ERROR;
    }

    if (strcmp(li->buf, MODEL_DEF_VERSION) != 0) {
        E_ERROR("Version in %s is \"%s\", but expected %s at line %d.\n",
                file_name, li->buf, MODEL_DEF_VERSION, lineiter_lineno(li));
        fclose(fp);

        if (strcmp(li->buf, "0.1") == 0) {
            E_ERROR("You must add an attribute field to all the model records. "
                    "See the SPHINX-III File Formats manual\n");
        }
        if (strcmp(li->buf, "0.2") == 0) {
            E_ERROR("You must add n_tied_state, n_tied_ci_state and n_tied_tmat "
                    "definitions at the head of the file. "
                    "See /net/alf19/usr2/eht/s3/cvtmdef.csh\n");
        }

        lineiter_free(li);
        return S3_ERROR;
    }

    n_tri = n_base = n_total_map = n_tied_state =
        n_tied_ci_state = n_tied_tmat = NO_NUMBER;
    for (i = 0; i < 6; i++) {
        li = lineiter_next(li);
        if (li == NULL) {
            E_ERROR("Incomplete count information in %s!?\n", file_name);
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }
        sscanf(li->buf, "%u %s", &n, tag);

        if (strcmp(tag, "n_base") == 0) {
            n_base = n;
        }
        else if (strcmp(tag, "n_tri") == 0) {
            n_tri = n;
        }
        else if (strcmp(tag, "n_state_map") == 0) {
            n_total_map = n;
        }
        else if (strcmp(tag, "n_tied_state") == 0) {
            n_tied_state = n;
        }
        else if (strcmp(tag, "n_tied_ci_state") == 0) {
            n_tied_ci_state = n;
        }
        else if (strcmp(tag, "n_tied_tmat") == 0) {
            n_tied_tmat = n;
        }
        else {
            E_ERROR("Unknown tag %s in file at line %d\n",
                    tag, lineiter_lineno(li));
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }
    }

    li = lineiter_next(li);

    *out_model_def = omd = ckd_calloc(1, sizeof(model_def_t));
    omd->acmod_set = acmod_set = acmod_set_new();

    /* Give the acmod_set module some storage allocation requirements */
    acmod_set_set_n_ci_hint(acmod_set, n_base);
    acmod_set_set_n_tri_hint(acmod_set, n_tri);

    n_total = n_base + n_tri;
    omd->defn = mdef = ckd_calloc(n_total, sizeof(model_def_entry_t));
    omd->n_total_state = n_total_map;

    all_state = ckd_calloc(n_total_map, sizeof(uint32));

    omd->n_tied_ci_state = n_tied_ci_state;
    omd->n_tied_state = n_tied_state;
    omd->n_tied_tmat = n_tied_tmat;
    omd->max_n_state = 0;
    omd->min_n_state = MAX_N_STATE;

    /* One line per base phone, then one line per triphone. */
    for (i = 0, j = 0, max_state = 0, max_ci_state = 0, max_tmat = 0;
         i < n_base; i++, j += n_state) {
        n_state = MAX_N_STATE;
        if (parse_base_line(li->buf, lineiter_lineno(li), &acmod_id, &tmat,
                            state, &n_state, acmod_set) != S3_SUCCESS) {
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }

        mdef[i].p = acmod_id;
        mdef[i].tmat = tmat;
        mdef[i].n_state = n_state;
        mdef[i].state = &all_state[j];
        memcpy((char *)mdef[i].state, (const char *)state,
               n_state * sizeof(uint32));

        update_totals(omd, &mdef[i]);
        li = lineiter_next(li);
    }

    for (; i < n_total; i++, j += n_state) {
        n_state = MAX_N_STATE;
        if (parse_tri_line(li->buf, lineiter_lineno(li), &acmod_id, &tmat,
                           state, &n_state, acmod_set) != S3_SUCCESS) {
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }

        mdef[i].p = acmod_id;
        mdef[i].tmat = tmat;
        mdef[i].n_state = n_state;
        mdef[i].state = &all_state[j];
        memcpy((char *)mdef[i].state, (const char *)state,
               n_state * sizeof(uint32));

        update_totals(omd, &mdef[i]);
        li = lineiter_next(li);
    }

    omd->n_defn = n_total;

    assert(j == n_total_map);

    E_INFO("Model definition info:\n");
    E_INFO("%u total models defined (%u base, %u tri)\n",
           omd->n_defn, n_base, n_tri);
    E_INFO("%u total states\n", omd->n_total_state);
    E_INFO("%u total tied states\n", omd->n_tied_state);
    E_INFO("%u total tied CI states\n", omd->n_tied_ci_state);
    E_INFO("%u total tied transition matrices\n", omd->n_tied_tmat);
    E_INFO("%u max state/model\n", omd->max_n_state);
    E_INFO("%u min state/model\n", omd->min_n_state);

    fclose(fp);
    lineiter_free(li);

    return S3_SUCCESS;
}
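/*
 * For reference, a model definition file as read by model_def_read begins
 * roughly as follows (all counts and the exact column layout of the
 * per-model lines are illustrative; parse_base_line and parse_tri_line
 * define the real format):
 *
 *   0.3
 *   54 n_base
 *   118004 n_tri
 *   472232 n_state_map
 *   6000 n_tied_state
 *   162 n_tied_ci_state
 *   54 n_tied_tmat
 *   AA - - - n/a 0 0 1 2 N
 *   ...
 *
 * with one line per base phone (n_base of them) followed by one line per
 * triphone (n_tri of them).
 */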
static void
read_ngram_instance(lineiter_t ** li, hash_table_t * wid, logmath_t * lmath,
                    int order, int order_max, ngram_raw_t * raw_ngram)
{
    int n;
    int words_expected;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];
    uint32 *word_out;

    *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return;
    }
    string_trim((*li)->buf, STRING_BOTH);
    words_expected = order + 1;
    if ((n = str2words((*li)->buf, wptr, NGRAM_MAX_ORDER + 1)) < words_expected) {
        if ((*li)->buf[0] != '\0') {
            E_WARN("Format error; %d-gram ignored: %s\n", order, (*li)->buf);
        }
    }
    else {
        if (order == order_max) {
            raw_ngram->weights =
                (float *) ckd_calloc(1, sizeof(*raw_ngram->weights));
            raw_ngram->weights[0] = atof_c(wptr[0]);
            if (raw_ngram->weights[0] > 0) {
                E_WARN("%d-gram [%s] has positive probability; zeroing\n",
                       order, wptr[1]);
                raw_ngram->weights[0] = 0.0f;
            }
            raw_ngram->weights[0] =
                logmath_log10_to_log_float(lmath, raw_ngram->weights[0]);
        }
        else {
            float weight, backoff;
            raw_ngram->weights =
                (float *) ckd_calloc(2, sizeof(*raw_ngram->weights));

            weight = atof_c(wptr[0]);
            if (weight > 0) {
                E_WARN("%d-gram [%s] has positive probability; zeroing\n",
                       order, wptr[1]);
                raw_ngram->weights[0] = 0.0f;
            }
            else {
                raw_ngram->weights[0] =
                    logmath_log10_to_log_float(lmath, weight);
            }

            if (n == order + 1) {
                raw_ngram->weights[1] = 0.0f;
            }
            else {
                backoff = atof_c(wptr[order + 1]);
                raw_ngram->weights[1] =
                    logmath_log10_to_log_float(lmath, backoff);
            }
        }
        raw_ngram->words =
            (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
        for (word_out = raw_ngram->words + order - 1, i = 1;
             word_out >= raw_ngram->words; --word_out, i++) {
            hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
        }
    }
}
dtree_t *
read_final_tree(FILE *fp, pset_t *pset, uint32 n_pset)
{
    dtree_t *out;
    dtree_node_t *node;
    uint32 n_node;
    char *s, str[128];
    lineiter_t *ln = NULL;
    int n_scan;
    uint32 i, node_id, node_id_y, node_id_n;
    comp_quest_t *q;
    float64 ent;
    float32 occ;
    int err;

    out = ckd_calloc(1, sizeof(dtree_t));

    ln = lineiter_start_clean(fp);
    s = ln->buf;

    sscanf(s, "%s%n", str, &n_scan);
    if (strcmp(str, "n_node") == 0) {
        s += n_scan;
        sscanf(s, "%u", &n_node);
    }
    else {
        E_FATAL("Format error; expecting n_node\n");
    }

    out->n_node = n_node;
    out->node = node = ckd_calloc(n_node, sizeof(dtree_node_t));

    for (i = 0; i < n_node; i++)
        node[i].node_id = i;

    err = FALSE;
    while ((ln = lineiter_next(ln))) {
        s = ln->buf;

        sscanf(s, "%u%n", &node_id, &n_scan);
        s += n_scan;

        sscanf(s, "%s%n", str, &n_scan);
        s += n_scan;
        if (strcmp(str, "-") == 0) {
            node_id_y = NO_ID;
        }
        else {
            node_id_y = atoi(str);
        }

        sscanf(s, "%s%n", str, &n_scan);
        s += n_scan;
        if (strcmp(str, "-") == 0) {
            node_id_n = NO_ID;
        }
        else {
            node_id_n = atoi(str);
        }

        sscanf(s, "%le%n", &ent, &n_scan);
        s += n_scan;
        sscanf(s, "%e%n", &occ, &n_scan);
        s += n_scan;

        if ((node_id_y != NO_ID) && (node_id_n != NO_ID)) {
            q = (comp_quest_t *)ckd_calloc(1, sizeof(comp_quest_t));
            if (s3parse_comp_quest(pset, n_pset, q, s) != S3_SUCCESS) {
                err = TRUE;
            }
            node[node_id].q = q;
        }
        else
            node[node_id].q = NULL;

        /* Check whether this is an internal node */
        if ((node_id_y != NO_ID) && (node_id_n != NO_ID))
            node[node_id].wt_ent_dec = ent;
        else
            node[node_id].wt_ent = ent;

        node[node_id].occ = occ;
        if ((node_id_y != NO_ID) && (node_id_n != NO_ID)) {
            node[node_id].y = &node[node_id_y];
            node[node_id].n = &node[node_id_n];

            node[node_id_y].p = node[node_id_n].p = &node[node_id];
        }
        else {
            node[node_id].y = NULL;
            node[node_id].n = NULL;
        }
    }

    if (err == TRUE) {
        free_tree(out);
        out = NULL;
    }

    lineiter_free(ln);

    return out;
}
dict_t *
dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
{
    FILE *fp, *fp2;
    int32 n;
    lineiter_t *li;
    dict_t *d;
    s3cipid_t sil;
    char const *dictfile = NULL, *fillerfile = NULL;

    if (config) {
        dictfile = cmd_ln_str_r(config, "-dict");
        fillerfile = cmd_ln_str_r(config, "-fdict");
    }

    /*
     * First obtain #words in dictionary (for hash table allocation).
     * Reason: The PC NT system doesn't like to grow memory gradually.
     * Better to allocate all the required memory in one go.
     */
    fp = NULL;
    n = 0;
    if (dictfile) {
        if ((fp = fopen(dictfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open dictionary file '%s' for reading",
                           dictfile);
        for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')
                n++;
        }
        rewind(fp);
    }

    fp2 = NULL;
    if (fillerfile) {
        if ((fp2 = fopen(fillerfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open filler dictionary file '%s' for reading",
                           fillerfile);
        for (li = lineiter_start(fp2); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')
                n++;
        }
        rewind(fp2);
    }

    /*
     * Allocate dict entries. HACK!! Allow some extra entries for words
     * not in file. Also check for type size restrictions.
     */
    d = (dict_t *) ckd_calloc(1, sizeof(dict_t));       /* freed in dict_free() */
    d->refcnt = 1;
    d->max_words =
        (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
    if (n >= MAX_S3WID)
        E_FATAL("#Words in dictionaries (%d) exceeds limit (%d)\n",
                n, MAX_S3WID);

    E_INFO("Allocating %d * %d bytes (%d KiB) for word entries\n",
           d->max_words, (int)sizeof(dictword_t),
           (int)(d->max_words * sizeof(dictword_t) / 1024));
    d->word = (dictword_t *) ckd_calloc(d->max_words, sizeof(dictword_t));      /* freed in dict_free() */
    d->n_word = 0;
    if (mdef)
        d->mdef = bin_mdef_retain(mdef);

    /* Create new hash table for word strings; case-insensitive word strings */
    if (config && cmd_ln_exists_r(config, "-dictcase"))
        d->nocase = cmd_ln_boolean_r(config, "-dictcase");
    d->ht = hash_table_new(d->max_words, d->nocase);

    /* Digest main dictionary file */
    if (fp) {
        E_INFO("Reading main dictionary: %s\n", dictfile);
        dict_read(fp, d);
        fclose(fp);
        E_INFO("%d words read\n", d->n_word);
    }

    /* Now the filler dictionary file, if it exists */
    d->filler_start = d->n_word;
    if (fillerfile) {
        E_INFO("Reading filler dictionary: %s\n", fillerfile);
        dict_read(fp2, d);
        fclose(fp2);
        E_INFO("%d words read\n", d->n_word - d->filler_start);
    }
    if (mdef)
        sil = bin_mdef_silphone(mdef);
    else
        sil = 0;
    if (dict_wordid(d, S3_START_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_START_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_FINISH_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_FINISH_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_SILENCE_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_SILENCE_WORD, &sil, 1);
    }

    d->filler_end = d->n_word - 1;

    /* Initialize distinguished word-ids */
    d->startwid = dict_wordid(d, S3_START_WORD);
    d->finishwid = dict_wordid(d, S3_FINISH_WORD);
    d->silwid = dict_wordid(d, S3_SILENCE_WORD);

    if ((d->filler_start > d->filler_end)
        || (!dict_filler_word(d, d->silwid)))
        E_FATAL("%s must occur (only) in filler dictionary\n",
                S3_SILENCE_WORD);

    /* No check that alternative pronunciations for filler words are in filler range!! */

    return d;
}
static int32
dict_read(FILE * fp, dict_t * d)
{
    lineiter_t *li;
    char **wptr;
    s3cipid_t *p;
    int32 lineno, nwd;
    s3wid_t w;
    int32 i, maxwd;
    size_t stralloc, phnalloc;

    maxwd = 512;
    p = (s3cipid_t *) ckd_calloc(maxwd + 4, sizeof(*p));
    wptr = (char **) ckd_calloc(maxwd, sizeof(char *));     /* Freed below */

    lineno = 0;
    stralloc = phnalloc = 0;
    for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
        lineno++;
        if (0 == strncmp(li->buf, "##", 2)
            || 0 == strncmp(li->buf, ";;", 2))
            continue;

        if ((nwd = str2words(li->buf, wptr, maxwd)) < 0) {
            /* Increase size of p, wptr. */
            nwd = str2words(li->buf, NULL, 0);
            assert(nwd > maxwd); /* why else would it fail? */
            maxwd = nwd;
            p = (s3cipid_t *) ckd_realloc(p, (maxwd + 4) * sizeof(*p));
            wptr = (char **) ckd_realloc(wptr, maxwd * sizeof(*wptr));
        }

        if (nwd == 0)           /* Empty line */
            continue;
        /* wptr[0] is the word-string and wptr[1..nwd-1] the pronunciation sequence */
        if (nwd == 1) {
            E_ERROR("Line %d: No pronunciation for word %s; ignored\n",
                    lineno, wptr[0]);
            continue;
        }

        /* Convert pronunciation string to CI-phone-ids */
        for (i = 1; i < nwd; i++) {
            p[i - 1] = dict_ciphone_id(d, wptr[i]);
            if (NOT_S3CIPID(p[i - 1])) {
                E_ERROR("Line %d: Bad ciphone: %s; word %s ignored\n",
                        lineno, wptr[i], wptr[0]);
                break;
            }
        }

        if (i == nwd) {         /* All CI-phones successfully converted to IDs */
            w = dict_add_word(d, wptr[0], p, nwd - 1);
            if (NOT_S3WID(w))
                E_ERROR("Line %d: dict_add_word (%s) failed (duplicate?); ignored\n",
                        lineno, wptr[0]);
            else {
                stralloc += strlen(d->word[w].word);
                phnalloc += d->word[w].pronlen * sizeof(s3cipid_t);
            }
        }
    }

    E_INFO("Allocated %d KiB for strings, %d KiB for phones\n",
           (int)stralloc / 1024, (int)phnalloc / 1024);
    ckd_free(p);
    ckd_free(wptr);

    return 0;
}
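/*
 * For reference, dict_read expects one word per line followed by its
 * pronunciation, with "##" or ";;" introducing comment lines and the
 * WORD(2) convention marking alternative pronunciations (entries
 * illustrative):
 *
 *   ;; main dictionary
 *   HELLO     HH AH L OW
 *   HELLO(2)  HH EH L OW
 *   WORLD     W ER L D
 */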
static int
read_ngram_instance(lineiter_t ** li, hash_table_t * wid, logmath_t * lmath,
                    int order, int order_max, ngram_raw_t * raw_ngram)
{
    int n;
    int words_expected;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];
    uint32 *word_out;

    if (*li)
        *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return -1;
    }
    words_expected = order + 1;
    if ((n = str2words((*li)->buf, wptr, NGRAM_MAX_ORDER + 1)) < words_expected) {
        E_ERROR("Format error; %d-gram ignored: %s\n", order, (*li)->buf);
        return -1;
    }

    raw_ngram->order = order;

    if (order == order_max) {
        raw_ngram->prob = atof_c(wptr[0]);
        if (raw_ngram->prob > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        raw_ngram->prob = logmath_log10_to_log_float(lmath, raw_ngram->prob);
    }
    else {
        float weight, backoff;

        weight = atof_c(wptr[0]);
        if (weight > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        else {
            raw_ngram->prob = logmath_log10_to_log_float(lmath, weight);
        }

        if (n == order + 1) {
            raw_ngram->backoff = 0.0f;
        }
        else {
            backoff = atof_c(wptr[order + 1]);
            raw_ngram->backoff = logmath_log10_to_log_float(lmath, backoff);
        }
    }
    raw_ngram->words = (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
    for (word_out = raw_ngram->words + order - 1, i = 1;
         word_out >= raw_ngram->words; --word_out, i++) {
        hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
    }
    return 0;
}
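/*
 * For reference, each line read here has the ARPA form
 * "log10(prob) w1 ... wN [log10(backoff)]"; for the bigram section of a
 * trigram model, e.g. (values illustrative):
 *
 *   -0.2553 of the -0.5861
 *
 * The backoff weight is never present at the highest order
 * (order == order_max) and is optional elsewhere, which is why
 * n == order + 1 above means "no backoff" and yields backoff = 0.0f.
 */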
static int
run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
{
    hash_table_t *files;
    hash_iter_t *itor;
    lineiter_t *li;
    FILE *ctlfh;
    int nskip, runlen, npart, rv = 0;

    if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
        E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
        return -1;
    }
    nskip = cmd_ln_int32_r(wtf->config, "-nskip");
    runlen = cmd_ln_int32_r(wtf->config, "-runlen");
    if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
        /* Count lines in the file. */
        int partlen, part, nlines = 0;
        part = cmd_ln_int32_r(wtf->config, "-part");
        for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
            ++nlines;
        fseek(ctlfh, 0, SEEK_SET);
        partlen = nlines / npart;
        nskip = partlen * (part - 1);
        if (part == npart)
            runlen = -1;
        else
            runlen = partlen;
    }

    if (runlen != -1) {
        E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
        files = hash_table_new(runlen, HASH_CASE_YES);
    }
    else {
        E_INFO("Processing all remaining utterances at position %d\n", nskip);
        files = hash_table_new(1000, HASH_CASE_YES);
    }

    for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
        char *c, *infile, *outfile;

        if (nskip-- > 0)
            continue;
        if (runlen == 0) {
            lineiter_free(li);
            break;
        }
        --runlen;

        string_trim(li->buf, STRING_BOTH);
        /* Extract the file ID from the control line. */
        if ((c = strchr(li->buf, ' ')) != NULL)
            *c = '\0';
        if (strlen(li->buf) == 0) {
            E_WARN("Empty line %d in control file, skipping\n", li->lineno);
            continue;
        }
        build_filenames(wtf->config, li->buf, &infile, &outfile);
        if (hash_table_lookup(files, infile, NULL) == 0)
            continue;
        rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
        hash_table_enter(files, infile, outfile);
        if (rv != 0) {
            lineiter_free(li);
            break;
        }
    }
    for (itor = hash_table_iter(files); itor;
         itor = hash_table_iter_next(itor)) {
        ckd_free((void *)hash_entry_key(itor->ent));
        ckd_free(hash_entry_val(itor->ent));
    }
    hash_table_free(files);
    if (fclose(ctlfh) == EOF)
        E_ERROR_SYSTEM("Failed to close control file");
    return rv;
}
static int
read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base,
                 unigram_t * unigrams)
{
    uint32 i;
    int n;
    int n_parts;
    char *wptr[3];

    while (*li && strcmp((*li)->buf, "\\1-grams:") != 0) {
        *li = lineiter_next(*li);
    }
    if (*li == NULL) {
        E_ERROR("Failed to find \\1-grams: mark\n");
        return -1;
    }

    n_parts = 2;
    for (i = 0; i < count; i++) {
        unigram_t *unigram;

        *li = lineiter_next(*li);
        if (*li == NULL) {
            E_ERROR("Unexpected end of ARPA file. Failed to read %dth unigram\n",
                    i + 1);
            return -1;
        }
        if ((n = str2words((*li)->buf, wptr, 3)) < n_parts) {
            E_ERROR("Format error at line '%s'; failed to read unigrams\n",
                    (*li)->buf);
            return -1;
        }

        unigram = &unigrams[i];
        unigram->prob =
            logmath_log10_to_log_float(base->lmath, atof_c(wptr[0]));
        if (unigram->prob > 0) {
            E_WARN("Unigram '%s' has positive probability\n", wptr[1]);
            unigram->prob = 0;
        }
        if (n == n_parts + 1) {
            unigram->bo =
                logmath_log10_to_log_float(base->lmath, atof_c(wptr[2]));
        }
        else {
            unigram->bo = 0.0f;
        }

        /* TODO: classify float with fpclassify and warn if bad value occurred */
        base->word_str[i] = ckd_salloc(wptr[1]);
    }

    /* Fill the hash table that maps unigram names to their word ids */
    for (i = 0; i < count; i++) {
        if ((hash_table_enter(base->wid, base->word_str[i],
                              (void *)(long)i)) != (void *)(long)i) {
            E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
        }
    }
    return 0;
}
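/*
 * For reference, the unigram section parsed above looks like this
 * (values illustrative):
 *
 *   \1-grams:
 *   -1.2041 </s>
 *   -99     <s>   -0.8572
 *   -2.3010 the   -0.4771
 *
 * Lines with only two fields (n == n_parts) get bo = 0.0f.
 */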
int
batch_decoder_run(batch_decoder_t *bd)
{
    int32 ctloffset, ctlcount, ctlincr;
    lineiter_t *li, *ali = NULL;

    search_run(bd->fwdtree);
    search_run(bd->fwdflat);

    ctloffset = cmd_ln_int32_r(bd->config, "-ctloffset");
    ctlcount = cmd_ln_int32_r(bd->config, "-ctlcount");
    ctlincr = cmd_ln_int32_r(bd->config, "-ctlincr");

    if (bd->alignfh)
        ali = lineiter_start(bd->alignfh);
    for (li = lineiter_start(bd->ctlfh); li; li = lineiter_next(li)) {
        alignment_t *al = NULL;
        char *wptr[4];
        int32 nf, sf, ef;

        if (li->lineno < ctloffset) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if ((li->lineno - ctloffset) % ctlincr != 0) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if (ctlcount != -1 && li->lineno >= ctloffset + ctlcount)
            break;
        if (ali)
            al = parse_alignment(ali->buf, search_factory_d2p(bd->sf));
        sf = 0;
        ef = -1;
        nf = str2words(li->buf, wptr, 4);
        if (nf == 0) {
            /* Do nothing. */
        }
        else if (nf < 0) {
            E_ERROR("Unexpected extra data in control file at line %d\n",
                    li->lineno);
        }
        else {
            char *file, *uttid;
            file = wptr[0];
            uttid = NULL;
            if (nf > 1)
                sf = atoi(wptr[1]);
            if (nf > 2)
                ef = atoi(wptr[2]);
            if (nf > 3)
                uttid = wptr[3];
            /* Do actual decoding. */
            batch_decoder_decode(bd, file, uttid, sf, ef, al);
        }
        alignment_free(al);
        if (ali)
            ali = lineiter_next(ali);
    }

    featbuf_producer_shutdown(search_factory_featbuf(bd->sf));
    return 0;
}
/*********************************************************************
 *
 * Function:
 *	topo_read
 *
 * Description:
 *	This routine reads an ASCII transition matrix which may then be
 *	used to determine the topology of the models used in the system.
 *
 * Function Inputs:
 *	float32 ***tmat -
 *		Output: the (normalized) transition matrix read from the file.
 *	uint32 *n_state_pm -
 *		Output: the number of states per model.
 *	const char *topo_file_name -
 *		The name of the topology file to read.
 *
 * Global Inputs:
 *	None
 *
 * Return Values:
 *	S3_SUCCESS is returned upon successful completion.
 *	S3_ERROR is returned upon an error condition.
 *
 * Global Outputs:
 *	None
 *
 *********************************************************************/
int32
topo_read(float32 ***tmat, uint32 *n_state_pm, const char *topo_file_name)
{
    float32 **out;
    FILE *fp;
    lineiter_t *li = NULL;
    uint32 n_state;
    uint32 i, j;
    float32 row_sum;

    assert(topo_file_name != NULL);

    fp = fopen(topo_file_name, "r");
    if (fp == NULL) {
        E_ERROR_SYSTEM("Unable to open %s for reading\n", topo_file_name);
        goto error;
    }

    li = lineiter_start_clean(fp);
    if (li == NULL) {
        E_ERROR("EOF encountered while reading version number in %s!?\n",
                topo_file_name);
        goto error;
    }

    if (strcmp(li->buf, TOPO_FILE_VERSION) != 0) {
        E_ERROR("Topo file version in %s is %s. Expected %s\n",
                topo_file_name, li->buf, TOPO_FILE_VERSION);
        goto error;
    }

    li = lineiter_next(li);
    if (li == NULL) {
        E_ERROR("EOF encountered while reading n_state in %s!?\n",
                topo_file_name);
        goto error;
    }
    sscanf(li->buf, "%u\n", &n_state);

    /*
     * Support Request 1504066: robust reading of topo file in SphinxTrain.
     *
     * When the user puts
     *
     *    0.1
     *    1.0 1.0 1.0 0.0
     *    1.0 1.0 1.0 0.0
     *    1.0 1.0 1.0 1.0
     *
     * instead of
     *
     *    0.1
     *    4
     *    1.0 1.0 1.0 0.0
     *    1.0 1.0 1.0 0.0
     *    1.0 1.0 1.0 1.0
     *
     * topo_read will misread the first 1.0 into n_state as 1, and the
     * generated transition matrix will corrupt bw as well. This problem
     * is now fixed.
     */
    if (n_state == 1) {
        E_ERROR("n_state = 1. If you are using a transition matrix with more "
                "than one state, this error may indicate a format problem in "
                "your input topology file. You are recommended to use "
                "perl/make_topology.pl to generate the topo file instead.\n");
        goto error;
    }

    out = (float32 **)ckd_calloc_2d(n_state - 1, n_state, sizeof(float32));

    for (i = 0; i < n_state - 1; i++) {
        row_sum = 0.0;
        for (j = 0; j < n_state; j++) {
            fscanf(fp, "%f", &out[i][j]);
            row_sum += out[i][j];
        }
        /* Normalize each row so that it sums to one. */
        for (j = 0; j < n_state; j++) {
            out[i][j] /= row_sum;
        }
    }

    *tmat = out;
    *n_state_pm = n_state;

    fclose(fp);
    lineiter_free(li);

    return S3_SUCCESS;

error:
    if (fp)
        fclose(fp);
    lineiter_free(li);
    return S3_ERROR;
}
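/* A minimal usage sketch (hypothetical, not from the original source):
 * read a topology file and free the matrix afterwards. The file name is
 * made up; ckd_free_2d is the usual companion to ckd_calloc_2d. */
static void
topo_read_example(void)
{
    float32 **tmat;
    uint32 n_state_pm;

    if (topo_read(&tmat, &n_state_pm, "my_model.topo") == S3_SUCCESS) {
        E_INFO("Read topology with %u states per model\n", n_state_pm);
        ckd_free_2d(tmat);
    }
}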