/*********************************************************************
 *
 * Function: corpus_set_partition
 *
 * Description:
 *    This function allows one to specify a set PART of a partition of
 *    the corpus into PARTS (roughly) equal sized partitions.
 *
 * Function Inputs:
 *    uint32 part -
 *	This argument selects the PARTth of PARTS sets
 *	(PART runs from 1..PARTS)
 *
 *    uint32 parts -
 *	The number of total (roughly equal sized) sets in the partition.
 *
 * Global Inputs:
 *    ctl_fp - control file, must have been set beforehand
 *
 * Return Values:
 *    S3_SUCCESS - Operation completed successfully
 *    S3_ERROR - Operation did not complete successfully
 *
 * Global Outputs:
 *    None
 *
 *********************************************************************/
int corpus_set_partition(uint32 part, uint32 parts)
{
    uint32 run_len;
    uint32 n_skip;
    int lineno = 0;
    lineiter_t* li;

    if (ctl_fp == NULL) {
        E_ERROR("Control file has not been set\n");
        return S3_ERROR;
    }

    /* Fix: guard against division by zero (parts == 0) and an
     * out-of-range selector; part is documented to run 1..parts. */
    if (parts == 0 || part == 0 || part > parts) {
        E_ERROR("Invalid partition specification %u of %u\n", part, parts);
        return S3_ERROR;
    }

    /* Count the lines in the control file. */
    for (li = lineiter_start(ctl_fp); li; li = lineiter_next(li)) {
        lineno++;
    }
    fseek(ctl_fp, 0L, SEEK_SET);

    /* NOTE(review): starting and immediately freeing an iterator after
     * the rewind looks like a no-op; it may exist to reset internal
     * lineiter state -- confirm before removing. */
    li = lineiter_start(ctl_fp);
    lineiter_free(li);

    run_len = lineno / parts;
    n_skip = (part - 1) * run_len;
    if (part == parts)
        run_len = UNTIL_EOF;  /* last partition absorbs the remainder */

    return corpus_set_interval(n_skip, run_len);
}
/**
 * Compute and report cross-entropy (bits), perplexity, total LM score,
 * and OOV / context-cue counts over every utterance in transcript
 * file LSNFN.
 *
 * Fixes relative to the previous version:
 *  - the transcript FILE handle was never closed (resource leak);
 *  - stray double semicolon in the declarations;
 *  - division by zero when every word was an OOV or context cue.
 */
static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
    FILE *fh;
    lineiter_t *litor;
    int32 nccs, noovs, nwords, lscr, denom;
    float64 ch, log_to_log2;

    if ((fh = fopen(lsnfn, "r")) == NULL)
        E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

    /* We have to keep ch in floating-point to avoid overflows, so
     * we might as well use log2. */
    log_to_log2 = log(logmath_get_base(lmath)) / log(2);
    lscr = nccs = noovs = nwords = 0;
    ch = 0.0;
    for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
        char **words;
        int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;

        /* First call counts the words, second call fills the array. */
        n = str2words(litor->buf, NULL, 0);
        if (n < 0)
            E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
        if (n == 0) /* Do nothing! */
            continue;
        words = ckd_calloc(n, sizeof(*words));
        str2words(litor->buf, words, n);

        /* Remove any utterance ID (FIXME: has to be a single "word") */
        if (words[n-1][0] == '('
            && words[n-1][strlen(words[n-1])-1] == ')')
            n = n - 1;

        tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
                              &tmp_noovs, &tmp_lscr);

        ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
        nccs += tmp_nccs;
        noovs += tmp_noovs;
        lscr += tmp_lscr;
        nwords += n;

        ckd_free(words);
    }
    fclose(fh);  /* fix: handle was previously leaked */

    /* Normalize by the number of scored words; guard against a
     * transcript consisting entirely of OOVs / context cues. */
    denom = nwords - nccs - noovs;
    if (denom > 0)
        ch /= denom;

    printf("cross-entropy: %f bits\n", ch);
    /* Calculate perplexity pplx = exp CH */
    printf("perplexity: %f\n", pow(2.0, ch));
    printf("lm score: %d\n", lscr);
    /* Report OOVs and CCs */
    printf("%d words evaluated\n", nwords);
    printf("%d OOVs (%.2f%%), %d context cues removed\n",
           noovs, (double)noovs / nwords * 100, nccs);
}
/**
 * Probe INFILE for a NIST_1A header; on success parse the header
 * fields into wtf->config, seek past the 1024-byte header, and hand
 * the open stream back via *out_fh (or close it when out_fh is NULL).
 *
 * Returns TRUE for a NIST file, FALSE for a non-NIST file, -1 on I/O
 * error.  When detect_endian is nonzero, -input_endian is set from
 * the sample_byte_format field.
 */
static int
open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh,
               int detect_endian)
{
    char magic[7];
    lineiter_t *it;
    FILE *fp;

    fp = fopen(infile, "rb");
    if (fp == NULL) {
        E_ERROR_SYSTEM("Failed to open %s", infile);
        return -1;
    }
    if (fread(magic, 1, 7, fp) != 7) {
        E_ERROR_SYSTEM("Failed to read NIST header");
        fclose(fp);
        return -1;
    }
    /* Is this actually a NIST file? */
    if (strncmp(magic, "NIST_1A", 7) != 0) {
        fclose(fp);
        return FALSE;
    }
    /* Rewind and parse header lines until the first blank line. */
    fseek(fp, 0, SEEK_SET);
    for (it = lineiter_start(fp); it; it = lineiter_next(it)) {
        char **toks;
        int ntok;

        string_trim(it->buf, STRING_BOTH);
        if (strlen(it->buf) == 0) {
            lineiter_free(it);
            break;
        }
        ntok = str2words(it->buf, NULL, 0);
        if (ntok != 3)  /* header fields are "name type value" triples */
            continue;
        toks = (char **)ckd_calloc(ntok, sizeof(*toks));
        str2words(it->buf, toks, ntok);
        if (0 == strcmp(toks[0], "sample_rate")) {
            cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(toks[2]));
        }
        else if (0 == strcmp(toks[0], "channel_count")) {
            cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(toks[2]));
        }
        else if (detect_endian && 0 == strcmp(toks[0], "sample_byte_format")) {
            cmd_ln_set_str_r(wtf->config, "-input_endian",
                             (0 == strcmp(toks[2], "10")) ? "big" : "little");
        }
        ckd_free(toks);
    }

    /* Data begins right after the fixed-size 1024-byte header. */
    fseek(fp, 1024, SEEK_SET);
    if (out_fh)
        *out_fh = fp;
    else
        fclose(fp);
    return TRUE;
}
/**
 * Start a "clean" line iterator over FH: the clean flag is set so
 * subsequent iteration trims/skips as lineiter_next() implements, the
 * first line is trimmed in place, and a leading '#' comment line is
 * skipped by advancing the iterator once.
 *
 * Returns NULL when the stream yields no lines.
 */
lineiter_t *
lineiter_start_clean(FILE *fh)
{
    lineiter_t *it = lineiter_start(fh);

    if (it == NULL)
        return NULL;
    it->clean = TRUE;
    if (it->buf == NULL || it->buf[0] != '#') {
        string_trim(it->buf, STRING_BOTH);
        return it;
    }
    /* First line is a comment; advance past it. */
    return lineiter_next(it);
}
/**
 * Create and populate a dictionary from the -dict and -fdict files
 * named in CONFIG (either may be absent).  The word count is obtained
 * in a first pass so all entry storage can be allocated in one go;
 * distinguished words (<s>, </s>, silence) are added if missing.
 *
 * Fix: sizeof() yields size_t; passing it to a %d conversion in the
 * E_INFO format is undefined behavior on LP64 platforms, so the size
 * arguments are now cast explicitly.
 *
 * Returns a new dict_t with refcount 1; caller releases with
 * dict_free().  E_FATALs on unopenable files or too many words.
 */
dict_t *
dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
{
    FILE *fp, *fp2;
    int32 n;
    lineiter_t *li;
    dict_t *d;
    s3cipid_t sil;
    char const *dictfile = NULL, *fillerfile = NULL;

    if (config) {
        dictfile = cmd_ln_str_r(config, "-dict");
        fillerfile = cmd_ln_str_r(config, "-fdict");
    }

    /*
     * First obtain #words in dictionary (for hash table allocation).
     * Reason: The PC NT system doesn't like to grow memory gradually.  Better to allocate
     * all the required memory in one go.
     */
    fp = NULL;
    n = 0;
    if (dictfile) {
        if ((fp = fopen(dictfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open dictionary file '%s' for reading",
                           dictfile);
        for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')  /* skip comment lines */
                n++;
        }
        rewind(fp);
    }

    fp2 = NULL;
    if (fillerfile) {
        if ((fp2 = fopen(fillerfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open filler dictionary file '%s' for reading",
                           fillerfile);
        for (li = lineiter_start(fp2); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')
                n++;
        }
        rewind(fp2);
    }

    /*
     * Allocate dict entries. HACK!!  Allow some extra entries for words not in file.
     * Also check for type size restrictions.
     */
    d = (dict_t *) ckd_calloc(1, sizeof(dict_t));       /* freed in dict_free() */
    d->refcnt = 1;
    d->max_words =
        (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
    if (n >= MAX_S3WID)
        E_FATAL("#Words in dictionaries (%d) exceeds limit (%d)\n", n,
                MAX_S3WID);

    /* Cast size_t values: %d expects int (see header comment). */
    E_INFO("Allocating %d * %d bytes (%d KiB) for word entries\n",
           d->max_words, (int)sizeof(dictword_t),
           (int)(d->max_words * sizeof(dictword_t) / 1024));
    d->word = (dictword_t *) ckd_calloc(d->max_words, sizeof(dictword_t));      /* freed in dict_free() */
    d->n_word = 0;
    if (mdef)
        d->mdef = bin_mdef_retain(mdef);

    /* Create new hash table for word strings; case-insensitive word strings */
    if (config && cmd_ln_exists_r(config, "-dictcase"))
        d->nocase = cmd_ln_boolean_r(config, "-dictcase");
    d->ht = hash_table_new(d->max_words, d->nocase);

    /* Digest main dictionary file */
    if (fp) {
        E_INFO("Reading main dictionary: %s\n", dictfile);
        dict_read(fp, d);
        fclose(fp);
        E_INFO("%d words read\n", d->n_word);
    }

    /* Now the filler dictionary file, if it exists */
    d->filler_start = d->n_word;
    if (fillerfile) {
        E_INFO("Reading filler dictionary: %s\n", fillerfile);
        dict_read(fp2, d);
        fclose(fp2);
        E_INFO("%d words read\n", d->n_word - d->filler_start);
    }
    if (mdef)
        sil = bin_mdef_silphone(mdef);
    else
        sil = 0;
    /* Make sure the distinguished words exist, mapping to silence. */
    if (dict_wordid(d, S3_START_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_START_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_FINISH_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_FINISH_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_SILENCE_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_SILENCE_WORD, &sil, 1);
    }

    d->filler_end = d->n_word - 1;

    /* Initialize distinguished word-ids */
    d->startwid = dict_wordid(d, S3_START_WORD);
    d->finishwid = dict_wordid(d, S3_FINISH_WORD);
    d->silwid = dict_wordid(d, S3_SILENCE_WORD);

    if ((d->filler_start > d->filler_end)
        || (!dict_filler_word(d, d->silwid)))
        E_FATAL("%s must occur (only) in filler dictionary\n",
                S3_SILENCE_WORD);

    /* No check that alternative pronunciations for filler words are in filler range!! */

    return d;
}
/**
 * Read dictionary entries from FP into D.  Each non-comment line is
 * "WORD PHONE PHONE ...": the phones are converted to CI-phone ids
 * and the word is added via dict_add_word().  Lines starting with
 * "##" or ";;" are comments; malformed lines are reported and skipped.
 *
 * Fix: when a line had more words than the current wptr array
 * (str2words returned < 0), the arrays were grown but the line was
 * never re-tokenized, leaving wptr with stale/unterminated pointers
 * (the failed call restores the line).  Re-parse after growing, as the
 * count-then-fill idiom used elsewhere in this codebase does.
 *
 * Returns 0.
 */
static int32
dict_read(FILE * fp, dict_t * d)
{
    lineiter_t *li;
    char **wptr;
    s3cipid_t *p;
    int32 lineno, nwd;
    s3wid_t w;
    int32 i, maxwd;
    size_t stralloc, phnalloc;

    maxwd = 512;
    p = (s3cipid_t *) ckd_calloc(maxwd + 4, sizeof(*p));
    wptr = (char **) ckd_calloc(maxwd, sizeof(char *));     /* Freed below */

    lineno = 0;
    stralloc = phnalloc = 0;
    for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
        lineno++;
        if (0 == strncmp(li->buf, "##", 2)
            || 0 == strncmp(li->buf, ";;", 2))
            continue;

        if ((nwd = str2words(li->buf, wptr, maxwd)) < 0) {
            /* Increase size of p, wptr. */
            nwd = str2words(li->buf, NULL, 0);
            assert(nwd > maxwd); /* why else would it fail? */
            maxwd = nwd;
            p = (s3cipid_t *) ckd_realloc(p, (maxwd + 4) * sizeof(*p));
            wptr = (char **) ckd_realloc(wptr, maxwd * sizeof(*wptr));
            /* Fix: re-tokenize now that the arrays are large enough. */
            nwd = str2words(li->buf, wptr, maxwd);
        }

        if (nwd == 0)           /* Empty line */
            continue;
        /* wptr[0] is the word-string and wptr[1..nwd-1] the pronunciation sequence */
        if (nwd == 1) {
            E_ERROR("Line %d: No pronunciation for word %s; ignored\n",
                    lineno, wptr[0]);
            continue;
        }

        /* Convert pronunciation string to CI-phone-ids */
        for (i = 1; i < nwd; i++) {
            p[i - 1] = dict_ciphone_id(d, wptr[i]);
            if (NOT_S3CIPID(p[i - 1])) {
                E_ERROR("Line %d: Bad ciphone: %s; word %s ignored\n",
                        lineno, wptr[i], wptr[0]);
                break;
            }
        }

        if (i == nwd) {         /* All CI-phones successfully converted to IDs */
            w = dict_add_word(d, wptr[0], p, nwd - 1);
            if (NOT_S3WID(w))
                E_ERROR
                    ("Line %d: dict_add_word (%s) failed (duplicate?); ignored\n",
                     lineno, wptr[0]);
            else {
                stralloc += strlen(d->word[w].word);
                phnalloc += d->word[w].pronlen * sizeof(s3cipid_t);
            }
        }
    }
    E_INFO("Allocated %d KiB for strings, %d KiB for phones\n",
           (int)stralloc / 1024, (int)phnalloc / 1024);
    ckd_free(p);
    ckd_free(wptr);

    return 0;
}
/**
 * Convert every utterance listed in CTLFILE, honoring -nskip/-runlen
 * or the -npart/-part partitioning options.  A hash table of already
 * converted input files prevents duplicate work; its keys/values (the
 * infile/outfile strings) are freed at the end.
 *
 * Fix: when an input file was already present in the hash table, the
 * freshly built infile/outfile strings were leaked on the skip path;
 * they are now released before continuing.
 *
 * Returns 0 on success, nonzero (the first failing conversion's code
 * or -1 for an unopenable control file) on failure.
 */
static int
run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
{
    hash_table_t *files;
    hash_iter_t *itor;
    lineiter_t *li;
    FILE *ctlfh;
    int nskip, runlen, npart, rv = 0;

    if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
        E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
        return -1;
    }
    nskip = cmd_ln_int32_r(wtf->config, "-nskip");
    runlen = cmd_ln_int32_r(wtf->config, "-runlen");
    if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
        /* Count lines in the file. */
        int partlen, part, nlines = 0;
        part = cmd_ln_int32_r(wtf->config, "-part");
        for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
            ++nlines;
        fseek(ctlfh, 0, SEEK_SET);
        partlen = nlines / npart;
        nskip = partlen * (part - 1);
        if (part == npart)
            runlen = -1;  /* last partition runs to EOF */
        else
            runlen = partlen;
    }
    if (runlen != -1) {
        E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
        files = hash_table_new(runlen, HASH_CASE_YES);
    }
    else {
        E_INFO("Processing all remaining utterances at position %d\n",
               nskip);
        files = hash_table_new(1000, HASH_CASE_YES);
    }
    for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
        char *c, *infile, *outfile;

        if (nskip-- > 0)
            continue;
        if (runlen == 0) {
            lineiter_free(li);
            break;
        }
        --runlen;

        string_trim(li->buf, STRING_BOTH);
        /* Extract the file ID from the control line. */
        if ((c = strchr(li->buf, ' ')) != NULL)
            *c = '\0';
        if (strlen(li->buf) == 0) {
            E_WARN("Empty line %d in control file, skipping\n", li->lineno);
            continue;
        }
        build_filenames(wtf->config, li->buf, &infile, &outfile);
        if (hash_table_lookup(files, infile, NULL) == 0) {
            /* Fix: already converted -- release the generated names
             * instead of leaking them. */
            ckd_free(infile);
            ckd_free(outfile);
            continue;
        }
        rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
        /* The hash table takes ownership of infile/outfile; they are
         * freed in the cleanup loop below. */
        hash_table_enter(files, infile, outfile);
        if (rv != 0) {
            lineiter_free(li);
            break;
        }
    }
    for (itor = hash_table_iter(files); itor;
         itor = hash_table_iter_next(itor)) {
        ckd_free((void *)hash_entry_key(itor->ent));
        ckd_free(hash_entry_val(itor->ent));
    }
    hash_table_free(files);
    if (fclose(ctlfh) == EOF)
        E_ERROR_SYSTEM("Failed to close control file");
    return rv;
}
/**
 * Decode every utterance in the control file, honoring
 * -ctloffset/-ctlcount/-ctlincr, keeping an optional alignment-file
 * iterator (bd->alignfh) in lockstep with the control-file iterator.
 * Each control line is "FILE [SF [EF [UTTID]]]".
 *
 * Fixes relative to the previous version:
 *  - breaking out on -ctlcount leaked the live control-file iterator;
 *  - a still-live alignment iterator was never freed after the loop.
 *
 * Returns 0.
 */
int
batch_decoder_run(batch_decoder_t *bd)
{
    int32 ctloffset, ctlcount, ctlincr;
    lineiter_t *li, *ali = NULL;

    search_run(bd->fwdtree);
    search_run(bd->fwdflat);

    ctloffset = cmd_ln_int32_r(bd->config, "-ctloffset");
    ctlcount = cmd_ln_int32_r(bd->config, "-ctlcount");
    ctlincr = cmd_ln_int32_r(bd->config, "-ctlincr");
    if (bd->alignfh)
        ali = lineiter_start(bd->alignfh);
    for (li = lineiter_start(bd->ctlfh); li; li = lineiter_next(li)) {
        alignment_t *al = NULL;
        char *wptr[4];
        int32 nf, sf, ef;

        /* Skip lines before the offset, keeping ali in lockstep. */
        if (li->lineno < ctloffset) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if ((li->lineno - ctloffset) % ctlincr != 0) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if (ctlcount != -1 && li->lineno >= ctloffset + ctlcount) {
            lineiter_free(li);  /* fix: iterator was leaked on break */
            break;
        }
        if (ali)
            al = parse_alignment(ali->buf, search_factory_d2p(bd->sf));
        sf = 0;
        ef = -1;
        nf = str2words(li->buf, wptr, 4);
        if (nf == 0) {
            /* Do nothing. */
        }
        else if (nf < 0) {
            E_ERROR("Unexpected extra data in control file at line %d\n",
                    li->lineno);
        }
        else {
            char *file, *uttid;
            file = wptr[0];
            uttid = NULL;
            if (nf > 1)
                sf = atoi(wptr[1]);
            if (nf > 2)
                ef = atoi(wptr[2]);
            if (nf > 3)
                uttid = wptr[3];
            /* Do actual decoding. */
            batch_decoder_decode(bd, file, uttid, sf, ef, al);
        }
        alignment_free(al);
        if (ali)
            ali = lineiter_next(ali);
    }
    /* Fix: release the alignment iterator if the control file ran out
     * (or we broke out) before the alignment file did. */
    if (ali)
        lineiter_free(ali);
    featbuf_producer_shutdown(search_factory_featbuf(bd->sf));
    return 0;
}