Example 1
static int
ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
                      hash_table_t * wid, logmath_t * lmath, uint32 count,
                      int order, int order_max)
{
    char expected_header[20];
    uint32 i;

    sprintf(expected_header, "\\%d-grams:", order);
    while (*li && strcmp((*li)->buf, expected_header) != 0) {
	*li = lineiter_next(*li);
    }
    
    if (*li == NULL) {
	E_ERROR("Failed to find '%s', language model file truncated\n", expected_header);
	return -1;
    }
    
    *raw_ngrams = (ngram_raw_t *) ckd_calloc(count, sizeof(ngram_raw_t));
    for (i = 0; i < count; i++) {
        if (read_ngram_instance(li, wid, lmath, order, order_max,
                            &((*raw_ngrams)[i])) < 0)
            break;
    }

    qsort(*raw_ngrams, count, sizeof(ngram_raw_t), &ngram_ord_comparator);
    return 0;
}
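A hedged sketch of how this helper might be driven once per order by the surrounding ARPA reader; apart from the functions shown in these examples, the variable names below are assumptions for illustration.

    /* Hypothetical driver: counts[] filled by read_counts_arpa() (Example 5),
     * li/wid/lmath provided by the enclosing loader. */
    ngram_raw_t *raw_ngrams[NGRAM_MAX_ORDER];
    int o;
    for (o = 2; o <= order_max; o++) {
        if (ngrams_raw_read_order(&raw_ngrams[o - 2], &li, wid, lmath,
                                  counts[o - 1], o, order_max) < 0)
            return -1;      /* truncated file; error already reported */
    }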
Example 2
/*********************************************************************
 *
 * Function: corpus_set_partition
 * 
 * Description: 
 *    This function selects subset R of a partition of the corpus
 *    into S (roughly) equal-sized subsets.
 * 
 * Function Inputs: 
 *    uint32 r -
 *	This argument selects the Rth of the OF_S subsets (R runs from 1..OF_S)
 *
 *    uint32 of_s -
 *	The number of total (roughly equal sized) sets in the partition.
 * 
 * Global Inputs: 
 *    None
 * 
 * Return Values: 
 *    S3_SUCCESS - Operation completed successfully
 *    S3_ERROR   - Operation did not complete successfully
 * 
 * Global Outputs: 
 *    None
 * 
 *********************************************************************/
int
corpus_set_partition(uint32 part,
		     uint32 parts)
{
    uint32 run_len;
    uint32 n_skip;
    int lineno = 0;
    lineiter_t* li;

    if (ctl_fp == NULL) {
	E_ERROR("Control file has not been set\n");

	return S3_ERROR;
    }

    for (li = lineiter_start(ctl_fp); li; li = lineiter_next(li)) {
	lineno++;
    }

    fseek(ctl_fp, 0L, SEEK_SET);
    
    li = lineiter_start(ctl_fp);
    lineiter_free(li);
    
    run_len = lineno / parts;

    n_skip = (part - 1) * run_len;

    if (part == parts)
	run_len = UNTIL_EOF;

    return corpus_set_interval(n_skip, run_len);
}
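A minimal usage sketch for the function above; corpus_set_ctl_filename() and corpus_init() are assumed to be the usual corpus.c entry points, and the file name is illustrative. With a 100-line control file and parts = 4, run_len is 25, so part 2 skips 25 entries and processes the next 25, while part 4 reads to EOF.

    if (corpus_set_ctl_filename("train.ctl") != S3_SUCCESS ||
        corpus_set_partition(2, 4) != S3_SUCCESS ||
        corpus_init() != S3_SUCCESS)
        E_FATAL("Unable to set up corpus partition\n");

Example 3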
static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
	FILE *fh;
        lineiter_t *litor;
	int32 nccs, noovs, nwords, lscr;
	float64 ch, log_to_log2;

	if ((fh = fopen(lsnfn, "r")) == NULL)
		E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

	/* We have to keep ch in floating-point to avoid overflows, so
	 * we might as well use log2. */
	log_to_log2 = log(logmath_get_base(lmath)) / log(2);
	lscr = nccs = noovs = nwords = 0;
	ch = 0.0;
        for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
		char **words;
		int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;

		n = str2words(litor->buf, NULL, 0);
		if (n < 0)
			E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
		if (n == 0) /* Do nothing! */
			continue;
		words = ckd_calloc(n, sizeof(*words));
		str2words(litor->buf, words, n);

		/* Remove any utterance ID (FIXME: has to be a single "word") */
		if (words[n-1][0] == '('
		    && words[n-1][strlen(words[n-1])-1] == ')')
			n = n - 1;

		tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
                                      &tmp_noovs, &tmp_lscr);

		ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
		nccs += tmp_nccs;
		noovs += tmp_noovs;
                lscr += tmp_lscr;
		nwords += n;
		
		ckd_free(words);
	}

	ch /= (nwords - nccs - noovs);
	printf("cross-entropy: %f bits\n", ch);

	/* Calculate perplexity pplx = 2^CH (CH is in bits) */
	printf("perplexity: %f\n", pow(2.0, ch));
        printf("lm score: %d\n", lscr);

	/* Report OOVs and CCs */
	printf("%d words evaluated\n", nwords);
	printf("%d OOVs (%.2f%%), %d context cues removed\n",
	       noovs, (double)noovs / nwords * 100, nccs);
}
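For reference, the printed quantities relate as follows (restating the code above, not an additional API): the accumulated entropy in bits is divided by the number of scored words, nwords - nccs - noovs, to give the per-word cross-entropy CH, and the perplexity is 2^CH, hence the pow(2.0, ch) call. For example, a cross-entropy of 7.5 bits corresponds to a perplexity of about 2^7.5 ≈ 181.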
Example 4
static int
open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
{
    char nist[7];
    lineiter_t *li;
    FILE *fh;

    if ((fh = fopen(infile, "rb")) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s", infile);
        return -1;
    }
    if (fread(&nist, 1, 7, fh) != 7) {
        E_ERROR_SYSTEM("Failed to read NIST header");
        fclose(fh);
        return -1;
    }
    /* Is this actually a NIST file? */
    if (0 != strncmp(nist, "NIST_1A", 7)) {
        fclose(fh);
        return FALSE;
    }
    /* Rewind, parse lines. */
    fseek(fh, 0, SEEK_SET);
    for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
        char **words;
        int nword;

        string_trim(li->buf, STRING_BOTH);
        if (strlen(li->buf) == 0) {
            lineiter_free(li);
            break;
        }
        nword = str2words(li->buf, NULL, 0);
        if (nword != 3)
            continue;
        words = (char **)ckd_calloc(nword, sizeof(*words));
        str2words(li->buf, words, nword);
        if (0 == strcmp(words[0], "sample_rate")) {
            cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
        }
        if (0 == strcmp(words[0], "channel_count")) {
            cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
        }
        if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
            cmd_ln_set_str_r(wtf->config, "-input_endian",
                             (0 == strcmp(words[2], "10")) ? "big" : "little");
        }
        ckd_free(words);
    }

    fseek(fh, 1024, SEEK_SET);
    if (out_fh)
        *out_fh = fh;
    else
        fclose(fh);
    return TRUE;
}
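For context, the start of a NIST SPHERE header that this parser consumes looks roughly like the following (values are illustrative). Only the three-token lines matter to the loop above, with words[0] as the key and words[2] as the value; a sample_byte_format of "10" marks big-endian data and "01" little-endian, matching the check above.

    NIST_1A
       1024
    sample_rate -i 16000
    channel_count -i 1
    sample_byte_format -s2 01
    end_head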
Example 5
/*
 * Read the #unigrams, #bigrams, #trigrams, ... declared in the \data\ section of the input file into counts[].
 */
static int
read_counts_arpa(lineiter_t ** li, uint32 * counts, int *order)
{
    int32 ngram, prev_ngram;
    uint32 ngram_cnt;

    /* skip file until past the '\data\' marker */
    while (*li) {
        if (strcmp((*li)->buf, "\\data\\") == 0)
            break;
        *li = lineiter_next(*li);
    }

    if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
        E_INFO("No \\data\\ mark in LM file\n");
        return -1;
    }

    prev_ngram = 0;
    *order = 0;
    while ((*li = lineiter_next(*li))) {
        if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
            break;
        if (ngram != prev_ngram + 1) {
            E_ERROR
                ("Ngram counts in LM file are not in order: %d follows %d\n",
                 ngram, prev_ngram);
            return -1;
        }
        prev_ngram = ngram;
        counts[*order] = ngram_cnt;
        (*order)++;
    }

    if (*li == NULL) {
        E_ERROR("EOF while reading ngram counts\n");
        return -1;
    }

    return 0;
}
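For reference, an ARPA header section as parsed here looks like the following (counts are illustrative); it would leave counts[0..2] = {5000, 120000, 180000} and *order = 3, with *li stopped at the first line that does not match "ngram %d=%d".

    \data\
    ngram 1=5000
    ngram 2=120000
    ngram 3=180000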
Example 6
lineiter_t *
lineiter_start_clean(FILE *fh)
{
    lineiter_t *li;
    
    li = lineiter_start(fh);
    
    if (li == NULL)
	return li;
    
    li->clean = TRUE;
    
    if (li->buf && li->buf[0] == '#') {
	li = lineiter_next(li);
    } else {
	string_trim(li->buf, STRING_BOTH);
    }
    
    return li;
}
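A minimal usage sketch, assuming sphinxbase's pio.h as the home of the lineiter API and an illustrative file name: the clean iterator skips '#' comment lines and trims surrounding whitespace, and lineiter_next() frees the iterator itself once it returns NULL at EOF.

    #include <stdio.h>
    #include <sphinxbase/pio.h>

    static void
    print_clean_lines(const char *path)    /* illustrative helper */
    {
        FILE *fh = fopen(path, "r");
        lineiter_t *li;

        if (fh == NULL)
            return;
        for (li = lineiter_start_clean(fh); li; li = lineiter_next(li)) {
            /* li->buf holds the current trimmed, comment-free line */
            printf("%s\n", li->buf);
        }
        fclose(fh);
    }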
Example 7
static void
ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
                      hash_table_t * wid, logmath_t * lmath, uint32 count,
                      int order, int order_max)
{
    char expected_header[20];
    uint32 i;

    sprintf(expected_header, "\\%d-grams:", order);
    while (*li && strcmp((*li)->buf, expected_header) != 0) {
        *li = lineiter_next(*li);
    }

    *raw_ngrams = (ngram_raw_t *) ckd_calloc(count, sizeof(ngram_raw_t));
    for (i = 0; i < count; i++) {
        read_ngram_instance(li, wid, lmath, order, order_max,
                            &((*raw_ngrams)[i]));
    }

    /* Sort the raw ngrams that were read */
    ngram_comparator(NULL, &order);     /* set the order used by the comparator */
    qsort(*raw_ngrams, count, sizeof(ngram_raw_t), &ngram_comparator);
}
Example 8
lineiter_t *
lineiter_start(FILE *fh)
{
    lineiter_t *li;

    li = (lineiter_t *)ckd_calloc(1, sizeof(*li));
    li->buf = (char *)ckd_malloc(128);
    li->buf[0] = '\0';
    li->bsiz = 128;
    li->len = 0;
    li->fh = fh;

    li = lineiter_next(li);
    
    /* Strip the UTF-8 BOM */
    
    if (li && 0 == strncmp(li->buf, "\xef\xbb\xbf", 3)) {
	/* Move the remainder of the line, including its NUL terminator */
	memmove(li->buf, li->buf + 3, strlen(li->buf + 3) + 1);
	li->len -= 3;
    }
    
    return li;
}
Example 9
int32
model_def_read(model_def_t **out_model_def,
	       const char *file_name)
{
    lineiter_t *li = NULL;
    uint32 n;
    char tag[32];
    acmod_set_t *acmod_set;
    uint32 i, j;
    acmod_id_t acmod_id;
    uint32 tmat;
    uint32 n_state;
    uint32 n_tri;
    uint32 n_base;
    uint32 n_total_map;
    uint32 n_tied_state;
    uint32 n_tied_ci_state;
    uint32 n_tied_tmat;
    uint32 state[MAX_N_STATE];
    uint32 n_total;
    model_def_t *omd;
    model_def_entry_t *mdef;
    uint32 *all_state;
    uint32 max_tmat;
    uint32 max_state;
    uint32 max_ci_state;
    
    FILE *fp;

    fp = fopen(file_name, "r");
    if (fp == NULL) {
	E_WARN_SYSTEM("Unable to open %s for reading",
		      file_name);

	return S3_ERROR;
    }
    
    li = lineiter_start_clean(fp);

    if (li == NULL) {
	E_ERROR("Not even a version number in %s!?\n",
		file_name);

	fclose(fp);

        lineiter_free(li);
	return S3_ERROR;
    }

    if (strcmp(li->buf, MODEL_DEF_VERSION) != 0) {
	E_ERROR("Version in %s is \"%s\", but expected %s at line %d\n",
		file_name, li->buf, MODEL_DEF_VERSION, lineiter_lineno(li));

	fclose(fp);
	
	if (strcmp(li->buf, "0.1") == 0) {
	    E_ERROR("You must add an attribute field to all the model records.  See SPHINX-III File Formats manual\n");
	}
	
	if (strcmp(li->buf, "0.2") == 0) {
	    E_ERROR("You must add n_tied_state, n_tied_ci_state and n_tied_tmat definitions at the head of the file.  See /net/alf19/usr2/eht/s3/cvtmdef.csh\n");
	}
	
        lineiter_free(li);
	return S3_ERROR;
    }

    n_tri = n_base = n_total_map = n_tied_state = n_tied_ci_state = n_tied_tmat = NO_NUMBER;
    for ( i = 0; i < 6; i++) {
        li = lineiter_next(li);
        if (li == NULL) {
	    E_ERROR("Incomplete count information in %s!?\n",
		    file_name);
	    
	    fclose(fp);
            lineiter_free(li);
	    return S3_ERROR;
	}

	sscanf(li->buf, "%u %s", &n, tag);

	if (strcmp(tag, "n_base") == 0) {
	    n_base = n;
	}	
	else if (strcmp(tag, "n_tri") == 0) {
	    n_tri = n;
	}
	else if (strcmp(tag, "n_state_map") == 0) {
	    n_total_map = n;
	}
	else if (strcmp(tag, "n_tied_state") == 0) {
	    n_tied_state = n;
	}
	else if (strcmp(tag, "n_tied_ci_state") == 0) {
	    n_tied_ci_state = n;
	}
	else if (strcmp(tag, "n_tied_tmat") == 0) {
	    n_tied_tmat = n;
	}
	else {
	    E_ERROR("Unknown tag %s in file at line %d\n",
		    tag, lineiter_lineno(li));
	    	    
	    fclose(fp);

            lineiter_free(li);
	    return S3_ERROR;
	}
    }
    li = lineiter_next(li);

    *out_model_def = omd = ckd_calloc(1, sizeof(model_def_t));
    omd->acmod_set = acmod_set = acmod_set_new();

    /* give the acmod_set module some storage allocation requirements */
    acmod_set_set_n_ci_hint(acmod_set, n_base);
    acmod_set_set_n_tri_hint(acmod_set, n_tri);

    n_total = n_base + n_tri;

    omd->defn = mdef = ckd_calloc(n_total, sizeof(model_def_entry_t));
    omd->n_total_state = n_total_map;

    all_state = ckd_calloc(n_total_map, sizeof(uint32));
    
    omd->n_tied_ci_state = n_tied_ci_state;
    omd->n_tied_state = n_tied_state;
    omd->n_tied_tmat = n_tied_tmat;

    omd->max_n_state = 0;
    omd->min_n_state = MAX_N_STATE;

    for (i = 0, j = 0, max_state = 0, max_ci_state = 0, max_tmat = 0;
	 i < n_base; i++, j += n_state) {
	n_state = MAX_N_STATE;
	if (parse_base_line(li->buf,
			    lineiter_lineno(li),
	                    &acmod_id,
			    &tmat,
			    state,
			    &n_state,
			    acmod_set) != S3_SUCCESS) {

	    fclose(fp);
            lineiter_free(li);
            return S3_ERROR;
	}

	mdef[i].p = acmod_id;
	mdef[i].tmat = tmat;
	mdef[i].n_state = n_state;
	mdef[i].state = &all_state[j];

	memcpy((char *)mdef[i].state, (const char *)state,
	       n_state * sizeof(uint32));

	update_totals(omd, &mdef[i]);

	li = lineiter_next(li);
    }

    for (; i < n_total; i++, j += n_state) {
	n_state = MAX_N_STATE;

	if (parse_tri_line(li->buf,
			   lineiter_lineno(li),
			   &acmod_id,
			   &tmat,
			   state,
			   &n_state,
			   acmod_set) != S3_SUCCESS) {
	    fclose(fp);
            lineiter_free(li);
	    return S3_ERROR;
	}

	mdef[i].p = acmod_id;
	mdef[i].tmat = tmat;
	mdef[i].n_state = n_state;
	mdef[i].state = &all_state[j];
	memcpy((char *)mdef[i].state,
	       (const char *)state,
	       n_state * sizeof(uint32));

	update_totals(omd, &mdef[i]);
	li = lineiter_next(li);
    }

    omd->n_defn = n_total;

    assert(j == n_total_map);
    
    E_INFO("Model definition info:\n");
    E_INFO("%u total models defined (%u base, %u tri)\n", omd->n_defn, n_base, n_tri);
    E_INFO("%u total states\n", omd->n_total_state);
    E_INFO("%u total tied states\n", omd->n_tied_state);
    E_INFO("%u total tied CI states\n", omd->n_tied_ci_state);
    E_INFO("%u total tied transition matrices\n", omd->n_tied_tmat);
    E_INFO("%u max state/model\n", omd->max_n_state);
    E_INFO("%u min state/model\n", omd->min_n_state);

    fclose(fp);

    lineiter_free(li);
    return S3_SUCCESS;
}
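For reference, the header consumed above (the version line plus the six "%u %s" count lines) looks like this, assuming the current MODEL_DEF_VERSION of "0.3" and illustrative counts; the per-model lines parsed by parse_base_line() and parse_tri_line() follow it.

    0.3
    54 n_base
    118004 n_tri
    472232 n_state_map
    6000 n_tied_state
    162 n_tied_ci_state
    54 n_tied_tmat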
Example 10
static void
read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
                    logmath_t * lmath, int order, int order_max,
                    ngram_raw_t * raw_ngram)
{
    int n;
    int words_expected;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];
    uint32 *word_out;

    *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return;
    }
    string_trim((*li)->buf, STRING_BOTH);
    words_expected = order + 1;

    if ((n =
         str2words((*li)->buf, wptr,
                   NGRAM_MAX_ORDER + 1)) < words_expected) {
        if ((*li)->buf[0] != '\0') {
            E_WARN("Format error; %d-gram ignored: %s\n", order,
                   (*li)->buf);
        }
    }
    else {
        if (order == order_max) {
            raw_ngram->weights =
                (float *) ckd_calloc(1, sizeof(*raw_ngram->weights));
            raw_ngram->weights[0] = atof_c(wptr[0]);
            if (raw_ngram->weights[0] > 0) {
                E_WARN("%d-gram [%s] has positive probability; setting it to 0\n",
                       order, wptr[1]);
                raw_ngram->weights[0] = 0.0f;
            }
            raw_ngram->weights[0] =
                logmath_log10_to_log_float(lmath, raw_ngram->weights[0]);
        }
        else {
            float weight, backoff;
            raw_ngram->weights =
                (float *) ckd_calloc(2, sizeof(*raw_ngram->weights));

            weight = atof_c(wptr[0]);
            if (weight > 0) {
                E_WARN("%d-gram [%s] has positive probability; setting it to 0\n",
                       order, wptr[1]);
                raw_ngram->weights[0] = 0.0f;
            }
            else {
                raw_ngram->weights[0] =
                    logmath_log10_to_log_float(lmath, weight);
            }

            if (n == order + 1) {
                raw_ngram->weights[1] = 0.0f;
            }
            else {
                backoff = atof_c(wptr[order + 1]);
                raw_ngram->weights[1] =
                    logmath_log10_to_log_float(lmath, backoff);
            }
        }
        raw_ngram->words =
            (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
        for (word_out = raw_ngram->words + order - 1, i = 1;
             word_out >= raw_ngram->words; --word_out, i++) {
            hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
        }
    }
}
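For reference, the ARPA lines parsed here have the form "log10prob w1 ... wN [log10backoff]", the backoff weight being present only for orders below the maximum; illustrative 2-gram entries from a trigram model:

    -0.2553 in the  -0.3010
    -1.8812 of a    -0.1287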
Example 11
dtree_t *
read_final_tree(FILE *fp,
		pset_t *pset,
		uint32 n_pset)
{
    dtree_t *out;
    dtree_node_t *node;
    uint32  n_node;
    char *s, str[128];
    lineiter_t *ln = NULL;
    uint32 n_scan;
    uint32 i, node_id, node_id_y, node_id_n;
    comp_quest_t *q;
    float64 ent;
    float32 occ;
    int err;

    out = ckd_calloc(1, sizeof(dtree_t));

    ln = lineiter_start_clean(fp);
    
    s = ln->buf;
    sscanf(s, "%s%n", str, &n_scan);
    if (strcmp(str, "n_node") == 0) {
	s += n_scan;
	sscanf(s, "%u", &n_node);
    }
    else {
	E_FATAL("Format error; expecting n_node\n");
    }

    out->n_node = n_node;
    out->node = node = ckd_calloc(n_node, sizeof(dtree_node_t));

    for (i = 0; i < n_node; i++)
	node[i].node_id = i;
    
    err = FALSE;
    
    while ((ln = lineiter_next(ln))) {
	s = ln->buf;

	sscanf(s, "%u%n", &node_id, &n_scan);
	s += n_scan;
	sscanf(s, "%s%n", str, &n_scan);
	s += n_scan;
	if (strcmp(str, "-") == 0) {
	    node_id_y = NO_ID;
	}
	else {
	    node_id_y = atoi(str);
	}
	sscanf(s, "%s%n", str, &n_scan);
	s += n_scan;
	if (strcmp(str, "-") == 0) {
	    node_id_n = NO_ID;
	}
	else {
	    node_id_n = atoi(str);
	}
	sscanf(s, "%le%n", &ent, &n_scan);
	s += n_scan;
	sscanf(s, "%e%n", &occ, &n_scan);
	s += n_scan;

	if ((node_id_y != NO_ID) && (node_id_n != NO_ID)) {
	    q = (comp_quest_t *)ckd_calloc(1, sizeof(comp_quest_t));
	    if (s3parse_comp_quest(pset, n_pset, q, s) != S3_SUCCESS) {
		err = TRUE;
	    }

	    node[node_id].q = q;
	}
	else
	    node[node_id].q = NULL;

	/* check if internal node */
	if ((node_id_y != NO_ID) && (node_id_n != NO_ID))
	    node[node_id].wt_ent_dec = ent;
	else
	    node[node_id].wt_ent = ent;

	node[node_id].occ = occ;

	if ((node_id_y != NO_ID) && (node_id_n != NO_ID)) {
	    node[node_id].y = &node[node_id_y];
	    node[node_id].n = &node[node_id_n];
	    node[node_id_y].p = node[node_id_n].p = &node[node_id];
	}
	else {
	    node[node_id].y = NULL;
	    node[node_id].n = NULL;
	}
    }

    if (err == TRUE) {
	free_tree(out);
	out = NULL;
    }

    lineiter_free(ln);
    return out;
}
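For context, after the "n_node N" header each line consumed by the while loop has the form "node_id yes_id no_id entropy occupancy [composite question]", with "-" for the missing children of a leaf; an illustrative fragment (the composite-question text, elided here, is whatever s3parse_comp_quest() accepts):

    n_node 5
    0 1 2 1.234560e+02 3.4e+03 ...
    1 3 4 8.765400e+01 1.9e+03 ...
    3 - - 1.200000e+01 8.0e+02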
Example 12
dict_t *
dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
{
    FILE *fp, *fp2;
    int32 n;
    lineiter_t *li;
    dict_t *d;
    s3cipid_t sil;
    char const *dictfile = NULL, *fillerfile = NULL;

    if (config) {
        dictfile = cmd_ln_str_r(config, "-dict");
        fillerfile = cmd_ln_str_r(config, "-fdict");
    }

    /*
     * First obtain #words in dictionary (for hash table allocation).
     * Reason: The PC NT system doesn't like to grow memory gradually.  Better to allocate
     * all the required memory in one go.
     */
    fp = NULL;
    n = 0;
    if (dictfile) {
        if ((fp = fopen(dictfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open dictionary file '%s' for reading", dictfile);
        for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')
                n++;
        }
        rewind(fp);
    }

    fp2 = NULL;
    if (fillerfile) {
        if ((fp2 = fopen(fillerfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open filler dictionary file '%s' for reading", fillerfile);
        for (li = lineiter_start(fp2); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')
                n++;
        }
        rewind(fp2);
    }

    /*
     * Allocate dict entries.  HACK!!  Allow some extra entries for words not in file.
     * Also check for type size restrictions.
     */
    d = (dict_t *) ckd_calloc(1, sizeof(dict_t));       /* freed in dict_free() */
    d->refcnt = 1;
    d->max_words =
        (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
    if (n >= MAX_S3WID)
        E_FATAL("#Words in dictionaries (%d) exceeds limit (%d)\n", n,
                MAX_S3WID);

    E_INFO("Allocating %d * %d bytes (%d KiB) for word entries\n",
           d->max_words, sizeof(dictword_t),
           d->max_words * sizeof(dictword_t) / 1024);
    d->word = (dictword_t *) ckd_calloc(d->max_words, sizeof(dictword_t));      /* freed in dict_free() */
    d->n_word = 0;
    if (mdef)
        d->mdef = bin_mdef_retain(mdef);

    /* Create new hash table for word strings; case-insensitive word strings */
    if (config && cmd_ln_exists_r(config, "-dictcase"))
        d->nocase = cmd_ln_boolean_r(config, "-dictcase");
    d->ht = hash_table_new(d->max_words, d->nocase);

    /* Digest main dictionary file */
    if (fp) {
        E_INFO("Reading main dictionary: %s\n", dictfile);
        dict_read(fp, d);
        fclose(fp);
        E_INFO("%d words read\n", d->n_word);
    }

    /* Now the filler dictionary file, if it exists */
    d->filler_start = d->n_word;
    if (fillerfile) {
        E_INFO("Reading filler dictionary: %s\n", fillerfile);
        dict_read(fp2, d);
        fclose(fp2);
        E_INFO("%d words read\n", d->n_word - d->filler_start);
    }
    if (mdef)
        sil = bin_mdef_silphone(mdef);
    else
        sil = 0;
    if (dict_wordid(d, S3_START_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_START_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_FINISH_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_FINISH_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_SILENCE_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_SILENCE_WORD, &sil, 1);
    }

    d->filler_end = d->n_word - 1;

    /* Initialize distinguished word-ids */
    d->startwid = dict_wordid(d, S3_START_WORD);
    d->finishwid = dict_wordid(d, S3_FINISH_WORD);
    d->silwid = dict_wordid(d, S3_SILENCE_WORD);

    if ((d->filler_start > d->filler_end)
        || (!dict_filler_word(d, d->silwid)))
        E_FATAL("%s must occur (only) in filler dictionary\n",
                S3_SILENCE_WORD);

    /* No check that alternative pronunciations for filler words are in filler range!! */

    return d;
}
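A minimal sketch of driving this constructor through the usual pocketsphinx/sphinxbase entry points (ps_args(), cmd_ln_init(), bin_mdef_read(), dict_free()); the file names are illustrative and the internal dict.h/bin_mdef.h headers are assumed to be available.

    cmd_ln_t *config = cmd_ln_init(NULL, ps_args(), TRUE,
                                   "-dict", "cmudict.dict",
                                   "-fdict", "filler.dict", NULL);
    bin_mdef_t *mdef = bin_mdef_read(config, "en-us/mdef");
    dict_t *d = dict_init(config, mdef);

    E_INFO("Loaded %d words (fillers %d..%d)\n",
           d->n_word, d->filler_start, d->filler_end);

    dict_free(d);
    bin_mdef_free(mdef);
    cmd_ln_free_r(config);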
Example 13
static int32
dict_read(FILE * fp, dict_t * d)
{
    lineiter_t *li;
    char **wptr;
    s3cipid_t *p;
    int32 lineno, nwd;
    s3wid_t w;
    int32 i, maxwd;
    size_t stralloc, phnalloc;

    maxwd = 512;
    p = (s3cipid_t *) ckd_calloc(maxwd + 4, sizeof(*p));
    wptr = (char **) ckd_calloc(maxwd, sizeof(char *)); /* Freed below */

    lineno = 0;
    stralloc = phnalloc = 0;
    for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
        lineno++;
        if (0 == strncmp(li->buf, "##", 2)
            || 0 == strncmp(li->buf, ";;", 2))
            continue;

        if ((nwd = str2words(li->buf, wptr, maxwd)) < 0) {
            /* Increase size of p, wptr. */
            nwd = str2words(li->buf, NULL, 0);
            assert(nwd > maxwd); /* why else would it fail? */
            maxwd = nwd;
            p = (s3cipid_t *) ckd_realloc(p, (maxwd + 4) * sizeof(*p));
            wptr = (char **) ckd_realloc(wptr, maxwd * sizeof(*wptr));
        }

        if (nwd == 0)           /* Empty line */
            continue;
        /* wptr[0] is the word-string and wptr[1..nwd-1] the pronunciation sequence */
        if (nwd == 1) {
            E_ERROR("Line %d: No pronunciation for word %s; ignored\n",
                    lineno, wptr[0]);
            continue;
        }


        /* Convert pronunciation string to CI-phone-ids */
        for (i = 1; i < nwd; i++) {
            p[i - 1] = dict_ciphone_id(d, wptr[i]);
            if (NOT_S3CIPID(p[i - 1])) {
                E_ERROR("Line %d: Bad ciphone: %s; word %s ignored\n",
                        lineno, wptr[i], wptr[0]);
                break;
            }
        }

        if (i == nwd) {         /* All CI-phones successfully converted to IDs */
            w = dict_add_word(d, wptr[0], p, nwd - 1);
            if (NOT_S3WID(w))
                E_ERROR
                    ("Line %d: dict_add_word (%s) failed (duplicate?); ignored\n",
                     lineno, wptr[0]);
            else {
                stralloc += strlen(d->word[w].word);
                phnalloc += d->word[w].pronlen * sizeof(s3cipid_t);
            }
        }
    }
    E_INFO("Allocated %d KiB for strings, %d KiB for phones\n",
           (int)stralloc / 1024, (int)phnalloc / 1024);
    ckd_free(p);
    ckd_free(wptr);

    return 0;
}
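For reference, the dictionary lines accepted here are "WORD PHONE1 PHONE2 ...", with "##" or ";;" starting comment lines and the conventional "(2)" suffix marking an alternative pronunciation (entries below are illustrative):

    ;; illustrative entries
    hello HH AH L OW
    hello(2) HH EH L OW
    world W ER L D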
Example 14
static int
read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
                    logmath_t * lmath, int order, int order_max,
                    ngram_raw_t * raw_ngram)
{
    int n;
    int words_expected;
    int i;
    char *wptr[NGRAM_MAX_ORDER + 1];
    uint32 *word_out;

    if (*li) 
        *li = lineiter_next(*li);
    if (*li == NULL) {
        E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
                order);
        return -1;
    }
    words_expected = order + 1;
    if ((n =
         str2words((*li)->buf, wptr,
                   NGRAM_MAX_ORDER + 1)) < words_expected) {
        E_ERROR("Format error; %d-gram ignored: %s\n", order, (*li)->buf);
        return -1;
    }

    raw_ngram->order = order;

    if (order == order_max) {
        raw_ngram->prob = atof_c(wptr[0]);
        if (raw_ngram->prob > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        raw_ngram->prob =
            logmath_log10_to_log_float(lmath, raw_ngram->prob);
    }
    else {
        float weight, backoff;

        weight = atof_c(wptr[0]);
        if (weight > 0) {
            E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
            raw_ngram->prob = 0.0f;
        }
        else {
            raw_ngram->prob =
                logmath_log10_to_log_float(lmath, weight);
        }

        if (n == order + 1) {
            raw_ngram->backoff = 0.0f;
        }
        else {
            backoff = atof_c(wptr[order + 1]);
            raw_ngram->backoff =
                logmath_log10_to_log_float(lmath, backoff);
        }
    }
    raw_ngram->words =
        (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
    for (word_out = raw_ngram->words + order - 1, i = 1;
         word_out >= raw_ngram->words; --word_out, i++) {
        hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
    }
    return 0;
}
Example 15
static int
run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
{
    hash_table_t *files;
    hash_iter_t *itor;
    lineiter_t *li;
    FILE *ctlfh;
    int nskip, runlen, npart, rv = 0;

    if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
        E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
        return -1;
    }
    nskip = cmd_ln_int32_r(wtf->config, "-nskip");
    runlen = cmd_ln_int32_r(wtf->config, "-runlen");
    if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
        /* Count lines in the file. */
        int partlen, part, nlines = 0;
        part = cmd_ln_int32_r(wtf->config, "-part");
        for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
            ++nlines;
        fseek(ctlfh, 0, SEEK_SET);
        partlen = nlines / npart;
        nskip = partlen * (part - 1);
        if (part == npart)
            runlen = -1;
        else
            runlen = partlen;
    }
    if (runlen != -1){
        E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
        files = hash_table_new(runlen, HASH_CASE_YES);
    }
    else {
        E_INFO("Processing all remaining utterances at position %d\n", nskip);
        files = hash_table_new(1000, HASH_CASE_YES);
    }
    for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
        char *c, *infile, *outfile;

        if (nskip-- > 0)
            continue;
        if (runlen == 0) {
            lineiter_free(li);
            break;
        }
        --runlen;

        string_trim(li->buf, STRING_BOTH);
        /* Extract the file ID from the control line. */
        if ((c = strchr(li->buf, ' ')) != NULL)
            *c = '\0';
        if (strlen(li->buf) == 0) {
            E_WARN("Empty line %d in control file, skipping\n", li->lineno);
            continue;
        }
        build_filenames(wtf->config, li->buf, &infile, &outfile);
        if (hash_table_lookup(files, infile, NULL) == 0)
            continue;
        rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
        hash_table_enter(files, infile, outfile);
        if (rv != 0) {
            lineiter_free(li);
            break;
        }
    }
    for (itor = hash_table_iter(files); itor;
         itor = hash_table_iter_next(itor)) {
        ckd_free((void *)hash_entry_key(itor->ent));
        ckd_free(hash_entry_val(itor->ent));
    }
    hash_table_free(files);

    if (fclose(ctlfh) == EOF)
        E_ERROR_SYSTEM("Failed to close control file");
    return rv;
}
Example 16
static int
read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base,
                 unigram_t * unigrams)
{
    uint32 i;
    int n;
    int n_parts;
    char *wptr[3];

    while (*li && strcmp((*li)->buf, "\\1-grams:") != 0) {
	*li = lineiter_next(*li);
    }
    if (*li == NULL) {
        E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
        return -1;
    }

    n_parts = 2;
    for (i = 0; i < count; i++) {
        *li = lineiter_next(*li);
        if (*li == NULL) {
            E_ERROR
                ("Unexpected end of ARPA file. Failed to read %dth unigram\n",
                 i + 1);
            return -1;
        }
        if ((n = str2words((*li)->buf, wptr, 3)) < n_parts) {
            E_ERROR("Format error; failed to read unigram from line: %s\n", (*li)->buf);
            return -1;
        }

        unigram_t *unigram = &unigrams[i];
        unigram->prob =
            logmath_log10_to_log_float(base->lmath, atof_c(wptr[0]));
        if (unigram->prob > 0) {
            E_WARN("Unigram '%s' has positive probability\n", wptr[1]);
            unigram->prob = 0;
        }
        if (n == n_parts + 1) {
            unigram->bo =
                logmath_log10_to_log_float(base->lmath,
                                           atof_c(wptr[2]));
        }
        else {
            unigram->bo = 0.0f;
        }

        /* TODO: classify float with fpclassify and warn if bad value occurred */
        base->word_str[i] = ckd_salloc(wptr[1]);
    }

    /* fill hash-table that maps unigram names to their word ids */
    for (i = 0; i < count; i++) {
        if ((hash_table_enter
             (base->wid, base->word_str[i],
              (void *) (long) i)) != (void *) (long) i) {
            E_WARN("Duplicate word in dictionary: %s\n",
                   base->word_str[i]);
        }
    }
    return 0;
}
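For reference, the unigram section consumed above looks like the following (values illustrative): each line is "log10prob word [log10backoff]", so wptr[0] is the probability, wptr[1] the word and wptr[2] the optional backoff weight.

    \1-grams:
    -1.8067 </s>
    -1.8067 <s>     -0.4123
    -2.5891 hello   -0.3010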
Example 17
int batch_decoder_run(batch_decoder_t *bd)
{
    int32 ctloffset, ctlcount, ctlincr;
    lineiter_t *li, *ali = NULL;

    search_run(bd->fwdtree);
    search_run(bd->fwdflat);

    ctloffset = cmd_ln_int32_r(bd->config, "-ctloffset");
    ctlcount = cmd_ln_int32_r(bd->config, "-ctlcount");
    ctlincr = cmd_ln_int32_r(bd->config, "-ctlincr");

    if (bd->alignfh)
        ali = lineiter_start(bd->alignfh);
    for (li = lineiter_start(bd->ctlfh); li; li = lineiter_next(li)) {
        alignment_t *al = NULL;
        char *wptr[4];
        int32 nf, sf, ef;

        if (li->lineno < ctloffset) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if ((li->lineno - ctloffset) % ctlincr != 0) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if (ctlcount != -1 && li->lineno >= ctloffset + ctlcount)
            break;
        if (ali)
            al = parse_alignment(ali->buf, search_factory_d2p(bd->sf));
        sf = 0;
        ef = -1;
        nf = str2words(li->buf, wptr, 4);
        if (nf == 0) {
            /* Do nothing. */
        }
        else if (nf < 0) {
            E_ERROR("Unexpected extra data in control file at line %d\n", li->lineno);
        }
        else {
            char *file, *uttid;
            file = wptr[0];
            uttid = NULL;
            if (nf > 1)
                sf = atoi(wptr[1]);
            if (nf > 2)
                ef = atoi(wptr[2]);
            if (nf > 3)
                uttid = wptr[3];
            /* Do actual decoding. */
            batch_decoder_decode(bd, file, uttid, sf, ef, al);
        }
        alignment_free(al);
        if (ali) ali = lineiter_next(ali);
    }
    featbuf_producer_shutdown(search_factory_featbuf(bd->sf));
    return 0;
}
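For reference, each control-file line parsed here carries one to four fields, "file [start_frame [end_frame [uttid]]]" (entries below are illustrative):

    an4/wav/an253-fash-b
    an4/wav/an254-fash-b 0 310 an254-fash-b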
Example 18
/*********************************************************************
 *
 * Function: 
 *	topo_read
 * 
 * Description: 
 * 	This routine reads an ASCII transition matrix which may then be
 *	used to determine the topology of the models used in the system.
 *
 * Traceability: 
 * 
 * Function Inputs: 
 * 
 * Global Inputs: 
 *	None
 * 
 * Return Values: 
 *	S3_SUCCESS is returned upon successful completion
 *	S3_ERROR is returned upon an error condition
 * 
 * Global Outputs: 
 *	None
 * 
 * Errors: 
 * 
 * Pre-Conditions: 
 * 
 * Post-Conditions: 
 * 
 * Design: 
 * 
 * Notes: 
 * 
 *********************************************************************/
int32
topo_read(float32 ***tmat,
	  uint32 *n_state_pm,
	  const char *topo_file_name)
{
    float32 **out;
    FILE *fp;
    lineiter_t *li = NULL;
    uint32 n_state;
    uint32 i, j;
    float32 row_sum;

    assert(topo_file_name != NULL);

    fp = fopen(topo_file_name, "r");
    if (fp == NULL) {
	E_ERROR_SYSTEM("Unable to open %s for reading\n", topo_file_name);

	goto error;
    }
    
    li = lineiter_start_clean(fp);
    
    if (li == NULL) {
	E_ERROR("EOF encountered while reading version number in %s!?\n", topo_file_name);

	goto error;
    }

    if (strcmp(li->buf, TOPO_FILE_VERSION) != 0) {
	E_ERROR("Topo file version in %s is %s.  Expected %s\n",
		topo_file_name, li->buf, TOPO_FILE_VERSION);

	goto error;
    }

    li = lineiter_next(li);
    if (li == NULL) {
	E_ERROR("EOF encountered while reading n_state in %s!?\n", topo_file_name);

	goto error;
    }

    sscanf(li->buf, "%u\n", &n_state);

    /* Support Request 1504066: robust reading of topo file in
       SphinxTrain
	   
       When user put 
       0.1
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 1.0 
       
       instead of 
       
       0.1
       4
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 0.0
       1.0 1.0 1.0 1.0
       
       topo_read will misread 1.0 into n_state as 1.  And the 
       generated transition matrix will corrupt bw as well. This 
       problem is now fixed. 
    */

    if(n_state==1) {
        E_ERROR("n_state = 1; if your transition matrix has more than one state, this usually means the input topology file is misformatted.  Consider using perl/make_topology.pl to generate the topo file instead.\n");
	goto error;
    }

    out = (float32 **)ckd_calloc_2d(n_state-1, n_state, sizeof(float32));

    for (i = 0; i < n_state-1; i++) {
	row_sum = 0.0;
	for (j = 0; j < n_state; j++) {
	    fscanf(fp, "%f", &out[i][j]);
	    row_sum += out[i][j];
	}
	for (j = 0; j < n_state; j++) {
	    out[i][j] /= row_sum;
	}
    }
    
    *tmat = out;
    *n_state_pm = n_state;

    fclose(fp);
    lineiter_free(li);
    return S3_SUCCESS;

error:    
    if (fp) fclose(fp);
    lineiter_free(li);
    return S3_ERROR;
}