Esempio n. 1
0
/*********************************************************************
 *
 * Function: corpus_set_partition
 * 
 * Description: 
 *    This function allows one to specify a set R of a partition of
 *    the corpus into S (roughly) equal sized partitions.
 * 
 * Function Inputs: 
 *    uint32 r -
 *	This argument selects the Rth OF_S sets (R runs from 1..OF_S)
 *
 *    uint32 of_s -
 *	The number of total (roughly equal sized) sets in the partition.
 * 
 * Global Inputs: 
 *    None
 * 
 * Return Values: 
 *    S3_SUCCESS - Operation completed successfully
 *    S3_ERROR   - Operation did not complete successfully
 * 
 * Global Outputs: 
 *    None
 * 
 *********************************************************************/
int
corpus_set_partition(uint32 part,
		     uint32 parts)
{
    uint32 run_len;
    uint32 n_skip;
    int lineno = 0;
    lineiter_t* li;

    if (ctl_fp == NULL) {
	E_ERROR("Control file has not been set\n");

	return S3_ERROR;
    }

    for (li = lineiter_start(ctl_fp); li; li = lineiter_next(li)) {
	lineno++;
    }

    fseek(ctl_fp, 0L, SEEK_SET);
    
    li = lineiter_start(ctl_fp);
    lineiter_free(li);
    
    run_len = lineno / parts;

    n_skip = (part - 1) * run_len;

    if (part == parts)
	run_len = UNTIL_EOF;

    return corpus_set_interval(n_skip, run_len);
}
static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
	FILE *fh;
        lineiter_t *litor;
	int32 nccs, noovs, nwords, lscr;
	float64 ch, log_to_log2;;

	if ((fh = fopen(lsnfn, "r")) == NULL)
		E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

	/* We have to keep ch in floating-point to avoid overflows, so
	 * we might as well use log2. */
	log_to_log2 = log(logmath_get_base(lmath)) / log(2);
	lscr = nccs = noovs = nwords = 0;
	ch = 0.0;
        for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
		char **words;
		int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;

		n = str2words(litor->buf, NULL, 0);
		if (n < 0)
			E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
		if (n == 0) /* Do nothing! */
			continue;
		words = ckd_calloc(n, sizeof(*words));
		str2words(litor->buf, words, n);

		/* Remove any utterance ID (FIXME: has to be a single "word") */
		if (words[n-1][0] == '('
		    && words[n-1][strlen(words[n-1])-1] == ')')
			n = n - 1;

		tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
                                      &tmp_noovs, &tmp_lscr);

		ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
		nccs += tmp_nccs;
		noovs += tmp_noovs;
                lscr += tmp_lscr;
		nwords += n;
		
		ckd_free(words);
	}

	ch /= (nwords - nccs - noovs);
	printf("cross-entropy: %f bits\n", ch);

	/* Calculate perplexity pplx = exp CH */
	printf("perplexity: %f\n", pow(2.0, ch));
        printf("lm score: %d\n", lscr);

	/* Report OOVs and CCs */
	printf("%d words evaluated\n", nwords);
	printf("%d OOVs (%.2f%%), %d context cues removed\n",
	       noovs, (double)noovs / nwords * 100, nccs);
}
Esempio n. 3
0
static int
open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
{
    char nist[7];
    lineiter_t *li;
    FILE *fh;

    if ((fh = fopen(infile, "rb")) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s", infile);
        return -1;
    }
    if (fread(&nist, 1, 7, fh) != 7) {
        E_ERROR_SYSTEM("Failed to read NIST header");
        fclose(fh);
        return -1;
    }
    /* Is this actually a NIST file? */
    if (0 != strncmp(nist, "NIST_1A", 7)) {
        fclose(fh);
        return FALSE;
    }
    /* Rewind, parse lines. */
    fseek(fh, 0, SEEK_SET);
    for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
        char **words;
        int nword;

        string_trim(li->buf, STRING_BOTH);
        if (strlen(li->buf) == 0) {
            lineiter_free(li);
            break;
        }
        nword = str2words(li->buf, NULL, 0);
        if (nword != 3)
            continue;
        words = (char **)ckd_calloc(nword, sizeof(*words));
        str2words(li->buf, words, nword);
        if (0 == strcmp(words[0], "sample_rate")) {
            cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
        }
        if (0 == strcmp(words[0], "channel_count")) {
            cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
        }
        if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
            cmd_ln_set_str_r(wtf->config, "-input_endian",
                             (0 == strcmp(words[2], "10")) ? "big" : "little");
        }
        ckd_free(words);
    }

    fseek(fh, 1024, SEEK_SET);
    if (out_fh)
        *out_fh = fh;
    else
        fclose(fh);
    return TRUE;
}
Esempio n. 4
0
lineiter_t *
lineiter_start_clean(FILE *fh)
{
    lineiter_t *li;
    
    li = lineiter_start(fh);
    
    if (li == NULL)
	return li;
    
    li->clean = TRUE;
    
    if (li->buf && li->buf[0] == '#') {
	li = lineiter_next(li);
    } else {
	string_trim(li->buf, STRING_BOTH);
    }
    
    return li;
}
Esempio n. 5
0
dict_t *
dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
{
    FILE *fp, *fp2;
    int32 n;
    lineiter_t *li;
    dict_t *d;
    s3cipid_t sil;
    char const *dictfile = NULL, *fillerfile = NULL;

    if (config) {
        dictfile = cmd_ln_str_r(config, "-dict");
        fillerfile = cmd_ln_str_r(config, "-fdict");
    }

    /*
     * First obtain #words in dictionary (for hash table allocation).
     * Reason: The PC NT system doesn't like to grow memory gradually.  Better to allocate
     * all the required memory in one go.
     */
    fp = NULL;
    n = 0;
    if (dictfile) {
        if ((fp = fopen(dictfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open dictionary file '%s' for reading", dictfile);
        for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')
                n++;
        }
        rewind(fp);
    }

    fp2 = NULL;
    if (fillerfile) {
        if ((fp2 = fopen(fillerfile, "r")) == NULL)
            E_FATAL_SYSTEM("Failed to open filler dictionary file '%s' for reading", fillerfile);
        for (li = lineiter_start(fp2); li; li = lineiter_next(li)) {
            if (li->buf[0] != '#')
                n++;
        }
        rewind(fp2);
    }

    /*
     * Allocate dict entries.  HACK!!  Allow some extra entries for words not in file.
     * Also check for type size restrictions.
     */
    d = (dict_t *) ckd_calloc(1, sizeof(dict_t));       /* freed in dict_free() */
    d->refcnt = 1;
    d->max_words =
        (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
    if (n >= MAX_S3WID)
        E_FATAL("#Words in dictionaries (%d) exceeds limit (%d)\n", n,
                MAX_S3WID);

    E_INFO("Allocating %d * %d bytes (%d KiB) for word entries\n",
           d->max_words, sizeof(dictword_t),
           d->max_words * sizeof(dictword_t) / 1024);
    d->word = (dictword_t *) ckd_calloc(d->max_words, sizeof(dictword_t));      /* freed in dict_free() */
    d->n_word = 0;
    if (mdef)
        d->mdef = bin_mdef_retain(mdef);

    /* Create new hash table for word strings; case-insensitive word strings */
    if (config && cmd_ln_exists_r(config, "-dictcase"))
        d->nocase = cmd_ln_boolean_r(config, "-dictcase");
    d->ht = hash_table_new(d->max_words, d->nocase);

    /* Digest main dictionary file */
    if (fp) {
        E_INFO("Reading main dictionary: %s\n", dictfile);
        dict_read(fp, d);
        fclose(fp);
        E_INFO("%d words read\n", d->n_word);
    }

    /* Now the filler dictionary file, if it exists */
    d->filler_start = d->n_word;
    if (fillerfile) {
        E_INFO("Reading filler dictionary: %s\n", fillerfile);
        dict_read(fp2, d);
        fclose(fp2);
        E_INFO("%d words read\n", d->n_word - d->filler_start);
    }
    if (mdef)
        sil = bin_mdef_silphone(mdef);
    else
        sil = 0;
    if (dict_wordid(d, S3_START_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_START_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_FINISH_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_FINISH_WORD, &sil, 1);
    }
    if (dict_wordid(d, S3_SILENCE_WORD) == BAD_S3WID) {
        dict_add_word(d, S3_SILENCE_WORD, &sil, 1);
    }

    d->filler_end = d->n_word - 1;

    /* Initialize distinguished word-ids */
    d->startwid = dict_wordid(d, S3_START_WORD);
    d->finishwid = dict_wordid(d, S3_FINISH_WORD);
    d->silwid = dict_wordid(d, S3_SILENCE_WORD);

    if ((d->filler_start > d->filler_end)
        || (!dict_filler_word(d, d->silwid)))
        E_FATAL("%s must occur (only) in filler dictionary\n",
                S3_SILENCE_WORD);

    /* No check that alternative pronunciations for filler words are in filler range!! */

    return d;
}
Esempio n. 6
0
static int32
dict_read(FILE * fp, dict_t * d)
{
    lineiter_t *li;
    char **wptr;
    s3cipid_t *p;
    int32 lineno, nwd;
    s3wid_t w;
    int32 i, maxwd;
    size_t stralloc, phnalloc;

    maxwd = 512;
    p = (s3cipid_t *) ckd_calloc(maxwd + 4, sizeof(*p));
    wptr = (char **) ckd_calloc(maxwd, sizeof(char *)); /* Freed below */

    lineno = 0;
    stralloc = phnalloc = 0;
    for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
        lineno++;
        if (0 == strncmp(li->buf, "##", 2)
            || 0 == strncmp(li->buf, ";;", 2))
            continue;

        if ((nwd = str2words(li->buf, wptr, maxwd)) < 0) {
            /* Increase size of p, wptr. */
            nwd = str2words(li->buf, NULL, 0);
            assert(nwd > maxwd); /* why else would it fail? */
            maxwd = nwd;
            p = (s3cipid_t *) ckd_realloc(p, (maxwd + 4) * sizeof(*p));
            wptr = (char **) ckd_realloc(wptr, maxwd * sizeof(*wptr));
        }

        if (nwd == 0)           /* Empty line */
            continue;
        /* wptr[0] is the word-string and wptr[1..nwd-1] the pronunciation sequence */
        if (nwd == 1) {
            E_ERROR("Line %d: No pronunciation for word %s; ignored\n",
                    lineno, wptr[0]);
            continue;
        }


        /* Convert pronunciation string to CI-phone-ids */
        for (i = 1; i < nwd; i++) {
            p[i - 1] = dict_ciphone_id(d, wptr[i]);
            if (NOT_S3CIPID(p[i - 1])) {
                E_ERROR("Line %d: Bad ciphone: %s; word %s ignored\n",
                        lineno, wptr[i], wptr[0]);
                break;
            }
        }

        if (i == nwd) {         /* All CI-phones successfully converted to IDs */
            w = dict_add_word(d, wptr[0], p, nwd - 1);
            if (NOT_S3WID(w))
                E_ERROR
                    ("Line %d: dict_add_word (%s) failed (duplicate?); ignored\n",
                     lineno, wptr[0]);
            else {
                stralloc += strlen(d->word[w].word);
                phnalloc += d->word[w].pronlen * sizeof(s3cipid_t);
            }
        }
    }
    E_INFO("Allocated %d KiB for strings, %d KiB for phones\n",
           (int)stralloc / 1024, (int)phnalloc / 1024);
    ckd_free(p);
    ckd_free(wptr);

    return 0;
}
Esempio n. 7
0
static int
run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
{
    hash_table_t *files;
    hash_iter_t *itor;
    lineiter_t *li;
    FILE *ctlfh;
    int nskip, runlen, npart, rv = 0;

    if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
        E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
        return -1;
    }
    nskip = cmd_ln_int32_r(wtf->config, "-nskip");
    runlen = cmd_ln_int32_r(wtf->config, "-runlen");
    if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
        /* Count lines in the file. */
        int partlen, part, nlines = 0;
        part = cmd_ln_int32_r(wtf->config, "-part");
        for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
            ++nlines;
        fseek(ctlfh, 0, SEEK_SET);
        partlen = nlines / npart;
        nskip = partlen * (part - 1);
        if (part == npart)
            runlen = -1;
        else
            runlen = partlen;
    }
    if (runlen != -1){
        E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
        files = hash_table_new(runlen, HASH_CASE_YES);
    }
    else {
        E_INFO("Processing all remaining utterances at position %d\n", nskip);
        files = hash_table_new(1000, HASH_CASE_YES);
    }
    for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
        char *c, *infile, *outfile;

        if (nskip-- > 0)
            continue;
        if (runlen == 0) {
            lineiter_free(li);
            break;
        }
        --runlen;

        string_trim(li->buf, STRING_BOTH);
        /* Extract the file ID from the control line. */
        if ((c = strchr(li->buf, ' ')) != NULL)
            *c = '\0';
        if (strlen(li->buf) == 0) {
    	    E_WARN("Empty line %d in control file, skipping\n", li->lineno);
    	    continue;
        }
        build_filenames(wtf->config, li->buf, &infile, &outfile);
        if (hash_table_lookup(files, infile, NULL) == 0)
            continue;
        rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
        hash_table_enter(files, infile, outfile);
        if (rv != 0) {
            lineiter_free(li);
            break;
        }
    }
    for (itor = hash_table_iter(files); itor;
         itor = hash_table_iter_next(itor)) {
        ckd_free((void *)hash_entry_key(itor->ent));
        ckd_free(hash_entry_val(itor->ent));
    }
    hash_table_free(files);

    if (fclose(ctlfh) == EOF)
        E_ERROR_SYSTEM("Failed to close control file");
    return rv;
}
Esempio n. 8
0
int batch_decoder_run(batch_decoder_t *bd)
{
    int32 ctloffset, ctlcount, ctlincr;
    lineiter_t *li, *ali = NULL;

    search_run(bd->fwdtree);
    search_run(bd->fwdflat);

    ctloffset = cmd_ln_int32_r(bd->config, "-ctloffset");
    ctlcount = cmd_ln_int32_r(bd->config, "-ctlcount");
    ctlincr = cmd_ln_int32_r(bd->config, "-ctlincr");

    if (bd->alignfh)
        ali = lineiter_start(bd->alignfh);
    for (li = lineiter_start(bd->ctlfh); li; li = lineiter_next(li)) {
        alignment_t *al = NULL;
        char *wptr[4];
        int32 nf, sf, ef;

        if (li->lineno < ctloffset) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if ((li->lineno - ctloffset) % ctlincr != 0) {
            if (ali)
                ali = lineiter_next(ali);
            continue;
        }
        if (ctlcount != -1 && li->lineno >= ctloffset + ctlcount)
            break;
        if (ali)
            al = parse_alignment(ali->buf, search_factory_d2p(bd->sf));
        sf = 0;
        ef = -1;
        nf = str2words(li->buf, wptr, 4);
        if (nf == 0) {
            /* Do nothing. */
        }
        else if (nf < 0) {
            E_ERROR("Unexpected extra data in control file at line %d\n", li->lineno);
        }
        else
        {
            char *file, *uttid;
            file = wptr[0];
            uttid = NULL;
            if (nf > 1)
            sf = atoi(wptr[1]);
            if (nf > 2)
            ef = atoi(wptr[2]);
            if (nf > 3)
            uttid = wptr[3];
            /* Do actual decoding. */
            batch_decoder_decode(bd, file, uttid, sf, ef, al);
        }
        alignment_free(al);
        if (ali) ali = lineiter_next(ali);
    }
    featbuf_producer_shutdown(search_factory_featbuf(bd->sf));
    return 0;
}