/*
 * Advance a plain (non-trimming) line iterator to the next line of its
 * stream.  The buffer li->buf is grown (doubling li->bsiz) until a
 * complete line -- terminated by '\n' or by end-of-file -- is held.
 *
 * Returns li on success.  On EOF before any character of a new line is
 * read, frees li and returns NULL, so callers must not touch li after a
 * NULL return.
 */
static lineiter_t *
lineiter_next_plain(lineiter_t *li)
{
    /* We are reading the next line */
    li->lineno++;

    /* Read a line and check for EOF. */
    if (fgets(li->buf, li->bsiz, li->fh) == NULL) {
        lineiter_free(li);
        return NULL;
    }

    /* If we managed to read the whole thing, then we are done
     * (this will be by far the most common result). */
    li->len = strlen(li->buf);
    if (li->len < li->bsiz - 1 || li->buf[li->len - 1] == '\n')
        return li;

    /* Otherwise we have to reallocate and keep going. */
    while (1) {
        li->bsiz *= 2;
        li->buf = (char *)ckd_realloc(li->buf, li->bsiz);
        /* If we get an EOF, we are obviously done. */
        /* NOTE(review): when fgets hits EOF without reading anything,
         * li->buf[li->len] is still the NUL written by the previous read
         * (realloc preserves contents), so the strlen below adds 0. */
        if (fgets(li->buf + li->len, li->bsiz - li->len, li->fh) == NULL) {
            li->len += strlen(li->buf + li->len);
            return li;
        }
        li->len += strlen(li->buf + li->len);
        /* If we managed to read the whole thing, then we are done. */
        if (li->len < li->bsiz - 1 || li->buf[li->len - 1] == '\n')
            return li;
    }

    /* Shouldn't get here. */
    return li;
}
int corpus_reset() { lineiter_t* li; n_run = UNTIL_EOF; assert(ctl_fp); fseek(ctl_fp, 0L, SEEK_SET); if (transcription_fp) fseek(transcription_fp, 0L, SEEK_SET); li = lineiter_start_clean(ctl_fp); if (li == NULL) { E_ERROR("Must be at least one line in the control file\n"); return S3_ERROR; } parse_ctl_line(li->buf, &next_ctl_path, &next_ctl_sf, &next_ctl_ef, &next_ctl_utt_id); lineiter_free (li); corpus_set_interval(sv_n_skip, sv_run_len); return S3_SUCCESS; }
/*********************************************************************
 *
 * Function: corpus_set_partition
 *
 * Description:
 *    This function allows one to specify a set R of a partition of
 *    the corpus into S (roughly) equal sized partitions.
 *
 * Function Inputs:
 *    uint32 r -
 *	This argument selects the Rth OF_S sets (R runs from 1..OF_S)
 *
 *    uint32 of_s -
 *	The number of total (roughly equal sized) sets in the partition.
 *
 * Global Inputs:
 *    None
 *
 * Return Values:
 *    S3_SUCCESS - Operation completed successfully
 *    S3_ERROR - Operation did not complete successfully
 *		 (control file unset, or part/parts out of range)
 *
 * Global Outputs:
 *    None
 *
 *********************************************************************/
int
corpus_set_partition(uint32 part, uint32 parts)
{
    uint32 run_len;
    uint32 n_skip;
    int lineno = 0;
    lineiter_t* li;

    if (ctl_fp == NULL) {
        E_ERROR("Control file has not been set\n");
        return S3_ERROR;
    }

    /* BUGFIX: reject degenerate partitions up front.  parts == 0 would
     * divide by zero below, and part == 0 would make the unsigned
     * expression (part - 1) * run_len wrap around. */
    if (parts == 0 || part == 0 || part > parts) {
        E_ERROR("Invalid partition specification %u of %u\n", part, parts);
        return S3_ERROR;
    }

    /* Count the lines in the control file. */
    for (li = lineiter_start(ctl_fp); li; li = lineiter_next(li)) {
        lineno++;
    }

    fseek(ctl_fp, 0L, SEEK_SET);
    /* Consume the first line again so the stream is one line ahead, the
     * same state corpus_set_ctl_filename() leaves it in.
     * NOTE(review): the line's contents are deliberately discarded here;
     * next_ctl_* still hold the values parsed when the file was opened. */
    li = lineiter_start(ctl_fp);
    lineiter_free(li);

    run_len = lineno / parts;
    n_skip = (part - 1) * run_len;
    /* The last partition absorbs any remainder lines. */
    if (part == parts)
        run_len = UNTIL_EOF;

    return corpus_set_interval(n_skip, run_len);
}
/********************************************************************* * * Function: corpus_set_ctl_filename * * Description: * This routine sets the control file used to define the corpus. * It has a side-effect of opening the control file. * * Function Inputs: * const char *ctl_filename - * This is the file name of the control file. * * Global Inputs: * None * * Return Values: * S3_SUCCESS - * Indicates the control file could be opened for reading. * * S3_ERROR - * Indicates some error occured while opening the control file. * * Global Outputs: * None * * Pre-Conditions: * ctl_filename argument must be a pointer to a C string. * * Post-Conditions: * *********************************************************************/ int corpus_set_ctl_filename(const char *ctl_filename) { lineiter_t *li; ctl_fp = fopen(ctl_filename, "rb"); if (ctl_fp == NULL) { E_ERROR_SYSTEM("Unable to open %s for reading", ctl_filename); return S3_ERROR; } li = lineiter_start_clean(ctl_fp); if (li == NULL) { E_ERROR("Must be at least one line in the control file\n"); return S3_ERROR; } parse_ctl_line(li->buf, &next_ctl_path, &next_ctl_sf, &next_ctl_ef, &next_ctl_utt_id); lineiter_free (li); return S3_SUCCESS; }
/*
 * Probe infile for a NIST (SPHERE) header and, when one is found, parse
 * the header fields that matter for feature extraction (sample_rate,
 * channel_count and, optionally, sample_byte_format) into wtf->config.
 *
 * Returns TRUE when the file is a NIST file (and, if out_fh is non-NULL,
 * stores the stream -- positioned past the header -- in *out_fh; the
 * caller then owns and must close it).  Returns FALSE when the file is
 * not NIST, and -1 on an I/O error.  In the FALSE/-1 cases the stream is
 * closed here.
 */
static int
open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
{
    char nist[7];
    lineiter_t *li;
    FILE *fh;

    if ((fh = fopen(infile, "rb")) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s", infile);
        return -1;
    }
    if (fread(&nist, 1, 7, fh) != 7) {
        E_ERROR_SYSTEM("Failed to read NIST header");
        fclose(fh);
        return -1;
    }
    /* Is this actually a NIST file? */
    if (0 != strncmp(nist, "NIST_1A", 7)) {
        fclose(fh);
        return FALSE;
    }
    /* Rewind, parse lines. */
    fseek(fh, 0, SEEK_SET);
    for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
        char **words;
        int nword;

        string_trim(li->buf, STRING_BOTH);
        /* A blank line terminates the header key/value section. */
        if (strlen(li->buf) == 0) {
            lineiter_free(li);
            break;
        }
        /* Header entries have the form "<name> <type> <value>" (3 words);
         * skip anything else. */
        nword = str2words(li->buf, NULL, 0);
        if (nword != 3)
            continue;
        words = (char **)ckd_calloc(nword, sizeof(*words));
        str2words(li->buf, words, nword);
        if (0 == strcmp(words[0], "sample_rate")) {
            cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
        }
        if (0 == strcmp(words[0], "channel_count")) {
            cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
        }
        /* "10" means byte order 1-0, i.e. big-endian samples. */
        if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
            cmd_ln_set_str_r(wtf->config, "-input_endian",
                             (0 == strcmp(words[2], "10")) ? "big" : "little");
        }
        ckd_free(words);
    }
    /* Skip past the header to the sample data -- presumably a fixed
     * 1024-byte NIST SPHERE header; TODO confirm against writer. */
    fseek(fh, 1024, SEEK_SET);
    if (out_fh)
        *out_fh = fh;
    else
        fclose(fh);
    return TRUE;
}
static int corpus_read_next_sent_file(char **trans) { FILE *fp; lineiter_t *li; /* open the current file */ fp = open_file_for_reading(DATA_TYPE_SENT); li = lineiter_start_clean(fp); if (li == NULL) { E_ERROR("Unable to read data in sent file %s\n", mk_filename(DATA_TYPE_SENT, cur_ctl_path)); return S3_ERROR; } *trans = strdup(li->buf); lineiter_free(li); fclose(fp); return S3_SUCCESS; }
/*
 * Read an ASCII model-definition (mdef) file.
 *
 * The file layout is: a version line (MODEL_DEF_VERSION), six count
 * lines ("<n> <tag>" for n_base, n_tri, n_state_map, n_tied_state,
 * n_tied_ci_state, n_tied_tmat, in any order), then one definition line
 * per base phone followed by one per triphone.
 *
 * On success stores a freshly allocated model_def_t in *out_model_def
 * (caller owns it) and returns S3_SUCCESS; returns S3_ERROR on any
 * open/parse failure.
 *
 * NOTE(review): on error paths after allocation begins, *out_model_def
 * has already been set and partially filled storage is not released --
 * callers appear expected to treat S3_ERROR as fatal.
 */
int32
model_def_read(model_def_t **out_model_def, const char *file_name)
{
    lineiter_t *li = NULL;
    uint32 n;
    char tag[32];
    acmod_set_t *acmod_set;
    uint32 i, j;
    acmod_id_t acmod_id;
    uint32 tmat;
    uint32 n_state;
    uint32 n_tri;
    uint32 n_base;
    uint32 n_total_map;
    uint32 n_tied_state;
    uint32 n_tied_ci_state;
    uint32 n_tied_tmat;
    uint32 state[MAX_N_STATE];
    uint32 n_total;
    model_def_t *omd;
    model_def_entry_t *mdef;
    uint32 *all_state;
    /* NOTE(review): max_tmat/max_state/max_ci_state are initialized in the
     * base-phone loop header below but never otherwise used. */
    uint32 max_tmat;
    uint32 max_state;
    uint32 max_ci_state;
    FILE *fp;

    fp = fopen(file_name, "r");
    if (fp == NULL) {
        E_WARN_SYSTEM("Unable to open %s for reading", file_name);
        return S3_ERROR;
    }

    /* First line must be the version string. */
    li = lineiter_start_clean(fp);
    if (li == NULL) {
        E_ERROR("ERROR not even a version number in %s!?\n", file_name);
        fclose(fp);
        lineiter_free(li);
        return S3_ERROR;
    }
    if (strcmp(li->buf, MODEL_DEF_VERSION) != 0) {
        E_ERROR("ERROR version(%s) == \"%s\", but expected %s at line %d.\n",
                file_name, li->buf, MODEL_DEF_VERSION, lineiter_lineno(li));
        fclose(fp);
        /* Give format-specific migration hints for known old versions. */
        if (strcmp(li->buf, "0.1") == 0) {
            E_ERROR("You must add an attribute field to all the model records. See SPHINX-III File Formats manual\n");
        }
        if (strcmp(li->buf, "0.2") == 0) {
            E_ERROR("You must add n_tied_state, n_tied_ci_state and n_tied_tmat definitions at the head of the file. 
See /net/alf19/usr2/eht/s3/cvtmdef.csh\n");
        }
        lineiter_free(li);
        return S3_ERROR;
    }

    /* Read the six "<count> <tag>" header lines; NO_NUMBER marks unset. */
    n_tri = n_base = n_total_map = n_tied_state = n_tied_ci_state = n_tied_tmat = NO_NUMBER;
    for ( i = 0; i < 6; i++) {
        li = lineiter_next(li);
        if (li == NULL) {
            E_ERROR("Incomplete count information in %s!?\n", file_name);
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }
        sscanf(li->buf, "%u %s", &n, tag);
        if (strcmp(tag, "n_base") == 0) {
            n_base = n;
        }
        else if (strcmp(tag, "n_tri") == 0) {
            n_tri = n;
        }
        else if (strcmp(tag, "n_state_map") == 0) {
            n_total_map = n;
        }
        else if (strcmp(tag, "n_tied_state") == 0) {
            n_tied_state = n;
        }
        else if (strcmp(tag, "n_tied_ci_state") == 0) {
            n_tied_ci_state = n;
        }
        else if (strcmp(tag, "n_tied_tmat") == 0) {
            n_tied_tmat = n;
        }
        else {
            E_ERROR("Unknown tag %s in file at line %d\n", tag, lineiter_lineno(li));
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }
    }

    /* Advance to the first model-definition line. */
    li = lineiter_next(li);

    *out_model_def = omd = ckd_calloc(1, sizeof(model_def_t));
    omd->acmod_set = acmod_set = acmod_set_new();

    /* give the acmod_set module some storage allocation requirements */
    acmod_set_set_n_ci_hint(acmod_set, n_base);
    acmod_set_set_n_tri_hint(acmod_set, n_tri);

    n_total = n_base + n_tri;
    omd->defn = mdef = ckd_calloc(n_total, sizeof(model_def_entry_t));
    omd->n_total_state = n_total_map;

    /* One flat array backs the per-model state-id slices. */
    all_state = ckd_calloc(n_total_map, sizeof(uint32));

    omd->n_tied_ci_state = n_tied_ci_state;
    omd->n_tied_state = n_tied_state;
    omd->n_tied_tmat = n_tied_tmat;
    omd->max_n_state = 0;
    omd->min_n_state = MAX_N_STATE;

    /* Base (context-independent) phone definitions. */
    for (i = 0, j = 0, max_state = 0, max_ci_state = 0, max_tmat = 0;
         i < n_base; i++, j += n_state) {
        n_state = MAX_N_STATE;
        if (parse_base_line(li->buf, lineiter_lineno(li), &acmod_id, &tmat,
                            state, &n_state, acmod_set) != S3_SUCCESS) {
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }
        mdef[i].p = acmod_id;
        mdef[i].tmat = tmat;
        mdef[i].n_state = n_state;
        mdef[i].state = &all_state[j];
        memcpy((char *)mdef[i].state, (const char *)state, n_state * sizeof(uint32));
        update_totals(omd, &mdef[i]);
        li = lineiter_next(li);
    }

    /* Triphone definitions (i and j continue from the base loop). */
    for (; i < n_total; i++, j += n_state) {
        n_state = MAX_N_STATE;
        if (parse_tri_line(li->buf, lineiter_lineno(li), &acmod_id, &tmat,
                           state, &n_state, acmod_set) != S3_SUCCESS) {
            fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
        }
        mdef[i].p = acmod_id;
        mdef[i].tmat = tmat;
        mdef[i].n_state = n_state;
        mdef[i].state = &all_state[j];
        memcpy((char *)mdef[i].state, (const char *)state, n_state * sizeof(uint32));
        update_totals(omd, &mdef[i]);
        li = lineiter_next(li);
    }

    omd->n_defn = n_total;

    /* All n_total_map state slots must have been consumed exactly. */
    assert(j == n_total_map);

    E_INFO("Model definition info:\n");
    E_INFO("%u total models defined (%u base, %u tri)\n", omd->n_defn, n_base, n_tri);
    E_INFO("%u total states\n", omd->n_total_state);
    E_INFO("%u total tied states\n", omd->n_tied_state);
    E_INFO("%u total tied CI states\n", omd->n_tied_ci_state);
    E_INFO("%u total tied transition matrices\n", omd->n_tied_tmat);
    E_INFO("%u max state/model\n", omd->max_n_state);
    E_INFO("%u min state/model\n", omd->min_n_state);

    fclose(fp);
    lineiter_free(li);

    return S3_SUCCESS;
}
/*
 * Read a final (pruned) decision tree from an open stream.
 *
 * The first line must be "n_node <count>".  Each subsequent line is
 * "<id> <y-child|-> <n-child|-> <wt_ent> <occ> [composite question]".
 * Internal nodes (both children present) get a parsed composite
 * question and wt_ent_dec; leaves get wt_ent and NULL children.
 *
 * Returns a freshly allocated dtree_t (caller frees with free_tree()),
 * or NULL when a question fails to parse or the file is empty.
 */
dtree_t *
read_final_tree(FILE *fp,
                pset_t *pset,
                uint32 n_pset)
{
    dtree_t *out;
    dtree_node_t *node;
    uint32 n_node;
    char *s, str[128];
    lineiter_t *ln = NULL;
    int n_scan;                 /* BUGFIX: %n must write through int*, not uint32* */
    uint32 i, node_id, node_id_y, node_id_n;
    comp_quest_t *q;
    float64 ent;
    float32 occ;
    int err;

    out = ckd_calloc(1, sizeof(dtree_t));

    ln = lineiter_start_clean(fp);
    /* BUGFIX: an empty file previously dereferenced a NULL iterator. */
    if (ln == NULL) {
        E_ERROR("Empty decision tree file\n");
        ckd_free(out);
        return NULL;
    }
    s = ln->buf;

    sscanf(s, "%s%n", str, &n_scan);
    if (strcmp(str, "n_node") == 0) {
        s += n_scan;
        sscanf(s, "%u", &n_node);
    }
    else {
        E_FATAL("Format error; expecting n_node\n");
    }

    out->n_node = n_node;
    out->node = node = ckd_calloc(n_node, sizeof(dtree_node_t));

    for (i = 0; i < n_node; i++)
        node[i].node_id = i;

    err = FALSE;

    while ((ln = lineiter_next(ln))) {
        s = ln->buf;
        sscanf(s, "%u%n", &node_id, &n_scan);
        s += n_scan;

        /* "-" marks a missing child. */
        sscanf(s, "%s%n", str, &n_scan);
        s += n_scan;
        if (strcmp(str, "-") == 0) {
            node_id_y = NO_ID;
        }
        else {
            node_id_y = atoi(str);
        }

        sscanf(s, "%s%n", str, &n_scan);
        s += n_scan;
        if (strcmp(str, "-") == 0) {
            node_id_n = NO_ID;
        }
        else {
            node_id_n = atoi(str);
        }

        sscanf(s, "%le%n", &ent, &n_scan);
        s += n_scan;
        sscanf(s, "%e%n", &occ, &n_scan);
        s += n_scan;

        /* BUGFIX: the original tested node_id_y twice in each of the three
         * internal-node checks below; a malformed line with exactly one
         * child would have indexed node[NO_ID] out of bounds. */
        if ((node_id_y != NO_ID) && (node_id_n != NO_ID)) {
            q = (comp_quest_t *)ckd_calloc(1, sizeof(comp_quest_t));
            if (s3parse_comp_quest(pset, n_pset, q, s) != S3_SUCCESS) {
                err = TRUE;
            }
            node[node_id].q = q;
        }
        else
            node[node_id].q = NULL;

        /* ck if internal node */
        if ((node_id_y != NO_ID) && (node_id_n != NO_ID))
            node[node_id].wt_ent_dec = ent;
        else
            node[node_id].wt_ent = ent;

        node[node_id].occ = occ;
        if ((node_id_y != NO_ID) && (node_id_n != NO_ID)) {
            node[node_id].y = &node[node_id_y];
            node[node_id].n = &node[node_id_n];
            node[node_id_y].p = node[node_id_n].p = &node[node_id];
        }
        else {
            node[node_id].y = NULL;
            node[node_id].n = NULL;
        }
    }

    if (err == TRUE) {
        free_tree(out);
        out = NULL;
    }
    lineiter_free(ln);

    return out;
}
/*
 * Advance the corpus to the next utterance: promote the look-ahead
 * next_ctl_* globals to the current cur_ctl_* slot (transferring string
 * ownership), honour the remaining run count, fetch the matching
 * transcription line when a big LSN file is in use, and pre-read the
 * following control line into next_ctl_*.
 *
 * Returns TRUE while utterances remain, FALSE at the end of the run or
 * of the control file.
 */
int corpus_next_utt()
{
    lineiter_t *li;

    /* Free the previous utterance's strings before taking ownership of
     * the look-ahead values. */
    if (cur_ctl_path) {
        free(cur_ctl_path);
    }
    cur_ctl_path = next_ctl_path;

    if (cur_ctl_utt_id) {
        free(cur_ctl_utt_id);
        cur_ctl_utt_id = NULL;
    }
    cur_ctl_utt_id = next_ctl_utt_id;

    cur_ctl_sf = next_ctl_sf;
    cur_ctl_ef = next_ctl_ef;

    /* Stop when a finite run length has been exhausted. */
    if (n_run != UNTIL_EOF) {
        if (n_run == 0)
            return FALSE;
        --n_run;
    }
    ++n_proc;

    /* No pre-read line means the control file is exhausted. */
    if (cur_ctl_path == NULL || strlen(cur_ctl_path) == 0)
        return FALSE;

    /* if a big LSN file exists, position it to the correct line
     * corpus_set_ctl_filename() reads the first line of
     * the control file, so that transcription_fp is one line
     * behind ctl_fp. */
    if (transcription_fp) {
        lineiter_t *trans_li;
        trans_li = lineiter_start_clean(transcription_fp);
        if (trans_li == NULL) {
            /* Transcription file ran out before the control file did. */
            E_FATAL("File length mismatch at line %d in %s\n",
                    n_proc, transcription_filename);
        }
        if (transcription_line)
            free(transcription_line);
        transcription_line = strdup(trans_li->buf);
        lineiter_free(trans_li);
    }

    /* Refill the one-line look-ahead from the control file. */
    li = lineiter_start_clean(ctl_fp);
    if (li != NULL) {
        parse_ctl_line(li->buf,
                       &next_ctl_path,
                       &next_ctl_sf,
                       &next_ctl_ef,
                       &next_ctl_utt_id);
        lineiter_free (li);
    }
    else {
        next_ctl_path = NULL;
        next_ctl_sf = NO_FRAME;
        next_ctl_ef = NO_FRAME;
        next_ctl_utt_id = NULL;
    }

    return TRUE;
}
/*
 * Convert every utterance listed in a control file, honouring -nskip,
 * -runlen and the -part/-npart partitioning options.  Duplicate input
 * files are converted only once (tracked in a hash table).
 *
 * Returns 0 on success, non-zero on the first conversion failure, or -1
 * when the control file cannot be opened.
 */
static int
run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
{
    hash_table_t *files;
    hash_iter_t *itor;
    lineiter_t *li;
    FILE *ctlfh;
    int nskip, runlen, npart, rv = 0;

    if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
        E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
        return -1;
    }
    nskip = cmd_ln_int32_r(wtf->config, "-nskip");
    runlen = cmd_ln_int32_r(wtf->config, "-runlen");
    if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
        /* Count lines in the file. */
        int partlen, part, nlines = 0;
        part = cmd_ln_int32_r(wtf->config, "-part");
        for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
            ++nlines;
        fseek(ctlfh, 0, SEEK_SET);
        /* Translate part/npart into an equivalent skip/run window;
         * the last partition runs to EOF. */
        partlen = nlines / npart;
        nskip = partlen * (part - 1);
        if (part == npart)
            runlen = -1;
        else
            runlen = partlen;
    }

    if (runlen != -1){
        E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
        files = hash_table_new(runlen, HASH_CASE_YES);
    }
    else {
        E_INFO("Processing all remaining utterances at position %d\n", nskip);
        files = hash_table_new(1000, HASH_CASE_YES);
    }

    for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
        char *c, *infile, *outfile;

        if (nskip-- > 0)
            continue;
        if (runlen == 0) {
            lineiter_free(li);
            break;
        }
        --runlen;

        string_trim(li->buf, STRING_BOTH);
        /* Extract the file ID from the control line. */
        if ((c = strchr(li->buf, ' ')) != NULL)
            *c = '\0';
        if (strlen(li->buf) == 0) {
            E_WARN("Empty line %d in control file, skipping\n", li->lineno);
            continue;
        }
        build_filenames(wtf->config, li->buf, &infile, &outfile);
        if (hash_table_lookup(files, infile, NULL) == 0) {
            /* BUGFIX: already converted -- the freshly built names were
             * leaked here; only hash-entered pairs are freed below. */
            ckd_free(infile);
            ckd_free(outfile);
            continue;
        }
        rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
        hash_table_enter(files, infile, outfile);
        if (rv != 0) {
            lineiter_free(li);
            break;
        }
    }

    /* Release the filename pairs owned by the hash table. */
    for (itor = hash_table_iter(files); itor;
         itor = hash_table_iter_next(itor)) {
        ckd_free((void *)hash_entry_key(itor->ent));
        ckd_free(hash_entry_val(itor->ent));
    }
    hash_table_free(files);

    if (fclose(ctlfh) == EOF)
        E_ERROR_SYSTEM("Failed to close control file");

    return rv;
}
/*
 * Read an ARPA-format language model from path into a trie-based
 * ngram_model_t.
 *
 * Reads the \data\ count section, the unigrams, and (for order > 1) the
 * raw higher-order n-grams, then builds the compressed trie.  Returns
 * the base model pointer on success (caller frees via the ngram_model
 * API), or NULL on open/parse failure.
 *
 * NOTE(review): the error path after ngram_model_init()/lm_trie_create()
 * frees only the top-level struct; storage those helpers allocated
 * internally is not released -- confirm whether this leak matters to
 * callers that retry.
 */
ngram_model_t *
ngram_model_trie_read_arpa(cmd_ln_t * config, const char *path, logmath_t * lmath)
{
    FILE *fp;
    lineiter_t *li;
    ngram_model_trie_t *model;
    ngram_model_t *base;
    ngram_raw_t **raw_ngrams;
    int32 is_pipe;
    uint32 counts[NGRAM_MAX_ORDER];
    uint32 fixed_counts[NGRAM_MAX_ORDER];
    int order;
    int i;

    E_INFO("Trying to read LM in arpa format\n");
    if ((fp = fopen_comp(path, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", path);
        return NULL;
    }

    model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
    li = lineiter_start_clean(fp);
    /* Read n-gram counts from file */
    if (read_counts_arpa(&li, counts, &order) == -1) {
        ckd_free(model);
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    E_INFO("LM of order %d\n", order);
    for (i = 0; i < order; i++) {
        E_INFO("#%d-grams: %d\n", i + 1, counts[i]);
    }

    base = &model->base;
    ngram_model_init(base, &ngram_model_trie_funcs, lmath, order, (int32) counts[0]);
    base->writable = TRUE;

    model->trie = lm_trie_create(counts[0], order);
    if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0) {
        ckd_free(model);
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }

    if (order > 1) {
        /* Higher orders: read the raw n-grams, reconcile the declared
         * counts with what was actually read, then build the trie. */
        raw_ngrams = ngrams_raw_read_arpa(&li, base->lmath, counts, order, base->wid);
        ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
        for (i = 0; i < order; i++) {
            base->n_counts[i] = fixed_counts[i];
        }
        lm_trie_alloc_ngram(model->trie, fixed_counts, order);
        lm_trie_build(model->trie, raw_ngrams, counts, order);
        ngrams_raw_free(raw_ngrams, counts, order);
    }

    lineiter_free(li);
    fclose_comp(fp, is_pipe);

    return base;
}
/*********************************************************************
 *
 * Function:
 * 	topo_read
 *
 * Description:
 * 	This routine reads an ASCII transition matrix which may then be
 *	used to determine the topology of the models used in the system.
 *
 * Traceability:
 *
 * Function Inputs:
 *	float32 ***tmat -
 *		Output: row-normalized (n_state-1) x n_state transition
 *		matrix, allocated here; caller owns it.
 *	uint32 *n_state_pm -
 *		Output: number of states per model read from the file.
 *	const char *topo_file_name -
 *		Topology file to read (version line, state count, then
 *		the matrix values).
 *
 * Global Inputs:
 * 	None
 *
 * Return Values:
 * 	S3_SUCCESS is returned upon successful completion
 *	S3_ERROR is returned upon an error condition
 *
 * Global Outputs:
 * 	None
 *
 * Errors:
 *
 * Pre-Conditions:
 *
 * Post-Conditions:
 *
 * Design:
 *
 * Notes:
 *
 *********************************************************************/
int32
topo_read(float32 ***tmat,
	  uint32 *n_state_pm,
	  const char *topo_file_name)
{
    float32 **out;
    FILE *fp;
    lineiter_t *li = NULL;
    uint32 n_state;
    uint32 i, j;
    float32 row_sum;

    assert(topo_file_name != NULL);

    fp = fopen(topo_file_name, "r");
    if (fp == NULL) {
        E_ERROR_SYSTEM("Unable to open %s for reading\n", topo_file_name);
        goto error;
    }

    li = lineiter_start_clean(fp);
    if (li == NULL) {
        E_ERROR("EOF encounted while reading version number in %s!?\n", topo_file_name);
        goto error;
    }

    if (strcmp(li->buf, TOPO_FILE_VERSION) != 0) {
        E_ERROR("Topo file version in %s is %s. Expected %s\n",
                topo_file_name, li->buf, TOPO_FILE_VERSION);
        goto error;
    }

    li = lineiter_next(li);
    if (li == NULL) {
        E_ERROR("EOF encountered while reading n_state in %s!?\n", topo_file_name);
        goto error;
    }
    /* NOTE(review): "%d" with a uint32* is a format mismatch ("%u" would
     * be correct), and the parse result is unchecked. */
    sscanf(li->buf, "%d\n", &n_state);

    /* Support Request 1504066: robust reading of topo file in SphinxTrain
       When user put
       0.1
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 1.0
       instead of
       0.1
       4
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 1.0
       topo_read will misread 1.0 into n_state as 1.  And the generated
       transition matrix will corrupt bw as well.  This problem is now
       fixed. */
    if(n_state==1) {
        E_ERROR("n_state =1, if you are using a transition matrix with more than 1 state, this error might show that there is format issue in your input topology file. 
You are recommended to use perl/make_topology.pl to generate the topo file instead.\n");
        goto error;
    }

    /* The final (exit) state has no outgoing row, hence n_state-1 rows. */
    out = (float **)ckd_calloc_2d(n_state-1, n_state, sizeof(float32));

    /* Read the matrix directly from fp (the lineiter above has consumed
     * only whole lines, so the stream is positioned at the first value)
     * and normalize each row to sum to 1.
     * NOTE(review): fscanf results are unchecked, and a row of zeros
     * would divide by zero below -- the file is trusted here. */
    for (i = 0; i < n_state-1; i++) {
        row_sum = 0.0;
        for (j = 0; j < n_state; j++) {
            fscanf(fp, "%f", &out[i][j]);
            row_sum += out[i][j];
        }
        for (j = 0; j < n_state; j++) {
            out[i][j] /= row_sum;
        }
    }

    *tmat = out;
    *n_state_pm = n_state;

    fclose(fp);

    lineiter_free(li);
    return S3_SUCCESS;

error:
    if (fp)
        fclose(fp);
    lineiter_free(li);
    return S3_ERROR;
}