/*
 * Find Viterbi alignment.
 *
 * Aligns one utterance against its reference transcript.  After optional
 * cepstral mean normalization and gain control, align_build_sent_hmm(sent) is
 * called; then, for every input frame, a feature vector is computed, the
 * active senones are scored and normalized, and align_frame() advances the
 * alignment by one frame.  Finally the state, phone and word segmentations
 * returned by align_end_utt() are written to whichever output directories
 * were configured (-s2stsegdir, -stsegdir, -phsegdir, -wdsegdir) and, if
 * outsentfp is set, handed to write_outsent().
 *
 * Working buffers and the output directory names are set up once, on the
 * first call, and kept in function-level statics for all later calls.
 *
 * Nothing is returned.  Failures (no sentence HMM, final state not reached)
 * are reported through E_ERROR and no segmentation is written.
 */
static void align_utt (char *sent,	/* In: Reference transcript */
		       float32 **mfc,	/* In: MFC cepstra for input utterance */
		       int32 nfr,	/* In: #frames of input */
		       char *ctlspec,	/* In: Utt specification from control file */
		       char *uttid)	/* In: Utterance id, for logging and other use */
{
    /* State that persists across calls; initialized on the first call only
       (feat == NULL is used as the "not yet initialized" marker). */
    static float32 **feat = NULL;
    static int32 w;
    static int32 topn;
    static gauden_dist_t ***dist;
    static int32 *senscr;
    static s3senid_t *sen_active;
    static int8 *mgau_active;
    static char *s2stsegdir;
    static char *stsegdir;
    static char *phsegdir;
    static char *wdsegdir;
    
    int32 i, s, sid, gid, n_sen_active, best;
    char *arg;
    align_stseg_t *stseg;
    align_phseg_t *phseg;
    align_wdseg_t *wdseg;

    if (! feat) {
	/* One-time allocation of necessary intermediate variables */

	/* Allocate space for a feature vector */
	feat = (float32 **) ckd_calloc (n_feat, sizeof(float32 *));
	for (i = 0; i < n_feat; i++)
	    feat[i] = (float32 *) ckd_calloc (featlen[i], sizeof(float32));
	
	/* Allocate space for top-N codeword density values in a codebook */
	w = feat_window_size ();	/* #MFC vectors needed on either side of current
					   frame to compute one feature vector */
	topn = *((int32 *) cmd_ln_access("-topn"));
	/* Clamp topn to the number of densities per codebook */
	if (topn > g->n_density) {
	    E_ERROR("-topn argument (%d) > #density codewords (%d); set to latter\n",
		   topn, g->n_density);
	    topn = g->n_density;
	}
	dist = (gauden_dist_t ***) ckd_calloc_3d (g->n_mgau, n_feat, topn,
						  sizeof(gauden_dist_t));
	
	/* Space for one frame of senone scores, and per frame active flags */
	senscr = (int32 *) ckd_calloc (sen->n_sen, sizeof(int32));
	sen_active = (s3senid_t *) ckd_calloc (sen->n_sen, sizeof(s3senid_t));
	mgau_active = (int8 *) ckd_calloc (g->n_mgau, sizeof(int8));

	/* Note various output directories */
	/* A directory pointer left NULL disables the corresponding output below */
	s2stsegdir = NULL;
	stsegdir = NULL;
	phsegdir = NULL;
	wdsegdir = NULL;
	if ((arg = (char *) cmd_ln_access ("-s2stsegdir")) != NULL)
	    s2stsegdir = (char *) ckd_salloc (arg);
	if ((arg = (char *) cmd_ln_access ("-stsegdir")) != NULL)
	    stsegdir = (char *) ckd_salloc (arg);
	if ((arg = (char *) cmd_ln_access ("-phsegdir")) != NULL)
	    phsegdir = (char *) ckd_salloc (arg);
	if ((arg = (char *) cmd_ln_access ("-wdsegdir")) != NULL)
	    wdsegdir = (char *) ckd_salloc (arg);
    }
    
    /* NOTE: the short-utterance guard below is commented out, so utterances
       with nfr <= 2*w frames are no longer rejected at this point. */
/* HACK HACKA HACK BHIKSHA 
    if (nfr <= (w<<1)) {
	E_ERROR("Utterance %s < %d frames (%d); ignored\n", uttid, (w<<1)+1, nfr);
	return;
    }
 END HACK HACKA HACK */
    
    cyctimer_reset_all ();
    counter_reset_all ();
    
    timing_reset (tm_utt);
    timing_start (tm_utt);
    cyctimer_resume (tmr_utt);

    /* AGC and CMN */
    /* NOTE(review): arg is passed to strcmp unchecked; this presumes both
       -cmn and -agc always have a (default) value -- confirm. */
    arg = (char *) cmd_ln_access ("-cmn");
    /* norm_mean is handed a buffer that starts 4 frames before mfc and is
       8 frames longer than nfr.  NOTE(review): presumably the caller pads the
       cepstra with 4 frames on each side -- confirm against the caller. */
    if (strcmp (arg, "current") == 0)
	norm_mean (mfc-4, nfr+8, cepsize); /* -4 HACKA HACK */
    arg = (char *) cmd_ln_access ("-agc");
    if (strcmp (arg, "max") == 0)
	agc_max (mfc, nfr);
    
    /* Without a sentence HMM there is nothing to align: clean up and bail out.
       Note that timing_start (tm_utt) above has no matching timing_stop on
       this path. */
    if (align_build_sent_hmm (sent) != 0) {
	align_destroy_sent_hmm ();
	cyctimer_pause (tmr_utt);

	E_ERROR("No sentence HMM; no alignment for %s\n", uttid);
	
	return;
    }
    
    align_start_utt (uttid);
    
    /*
     * A feature vector for frame f depends on input MFC vectors [f-w..f+w].
     * The code that replicated the first and last true feature vectors for
     * the first w and last w frames is commented out below; as written, every
     * frame i is computed directly from mfc+i.
     */
    for (i = 0; i < nfr; i++) {
	cyctimer_resume (tmr_utt);
	
	/* Compute feature vector for current frame from input speech cepstra */
/* HACK HACKA HACK BHIKSHA 
	if (i < w)
	    feat_cep2feat (mfc+w, feat);
	else if (i >= nfr-w)
	    feat_cep2feat (mfc+(nfr-w-1), feat);
	else
END HACK HACKA HACK */
	    feat_cep2feat (mfc+i, feat);

	/*
	 * Evaluate gaussian density codebooks and senone scores for input codeword.
	 * Evaluate only active codebooks and senones.
	 */
	/* Obtain active senone flags */
	cyctimer_resume (tmr_senone);
	align_sen_active (sen_active, sen->n_sen);
	/* Flag all CI senones to active if interpolating */
	if (interp) {
	    for (s = 0; s < mdef->n_ci_sen; s++)
		sen_active[s] = 1;
	}
	/* Turn active flags into list (for faster access) */
	/* Done in place: the write index n_sen_active never exceeds the read
	   index s, so no flag is overwritten before it has been examined.
	   NOTE(review): the bound here is mdef->n_sen while the array was sized
	   with sen->n_sen -- presumably the two are equal; confirm. */
	n_sen_active = 0;
	for (s = 0; s < mdef->n_sen; s++) {
	    if (sen_active[s])
		sen_active[n_sen_active++] = s;
	}
	cyctimer_pause (tmr_senone);
	
	/* Flag all active mixture-gaussian codebooks */
	/* i.e. every codebook referenced by at least one active senone */
	cyctimer_resume (tmr_gauden);
	for (gid = 0; gid < g->n_mgau; gid++)
	    mgau_active[gid] = 0;
	for (s = 0; s < n_sen_active; s++) {
	    sid = sen_active[s];
	    mgau_active[sen->mgau[sid]] = 1;
	}
	
	/* Compute topn gaussian density values (for active codebooks) */
	for (gid = 0; gid < g->n_mgau; gid++)
	    if (mgau_active[gid])
		gauden_dist (g, gid, topn, feat, dist[gid]);
	cyctimer_pause (tmr_gauden);
	
	/* Evaluate active senones */
	/* best starts at the most negative int32 and tracks the highest score */
	cyctimer_resume (tmr_senone);
	best = (int32) 0x80000000;
	for (s = 0; s < n_sen_active; s++) {
	    sid = sen_active[s];
	    senscr[sid] = senone_eval (sen, sid, dist[sen->mgau[sid]], topn);
	    if (best < senscr[sid])
		best = senscr[sid];
	}
	/* Interpolate each active non-CI senone with its CI counterpart */
	if (interp) {
	    for (s = 0; s < n_sen_active; s++) {
		if ((sid = sen_active[s]) >= mdef->n_ci_sen)
		    interp_cd_ci (interp, senscr, sid, mdef->cd2cisen[sid]);
	    }
	}
	
	/* Normalize senone scores (interpolation above can only lower best score) */
	for (s = 0; s < n_sen_active; s++) {
	    sid = sen_active[s];
	    senscr[sid] -= best;
	}
	/* Record the amount subtracted for this frame.  senscale is defined
	   outside this function; NOTE(review): presumably it has room for at
	   least nfr entries -- confirm. */
	senscale[i] = best;
	cyctimer_pause (tmr_senone);
	
	/* Step alignment one frame forward */
	cyctimer_resume (tmr_align);
	align_frame (senscr);
	cyctimer_pause (tmr_align);
	
	cyctimer_pause (tmr_utt);
    }
    timing_stop (tm_utt);

    printf ("\n");

    /* Wind up alignment for this utterance */
    if (align_end_utt (&stseg, &phseg, &wdseg) < 0)
	E_ERROR("Final state not reached; no alignment for %s\n\n", uttid);
    else {
	/* Write each segmentation only where an output destination was configured */
	if (s2stsegdir)
	    write_s2stseg (s2stsegdir, stseg, uttid, ctlspec);
	if (stsegdir)
	    write_stseg (stsegdir, stseg, uttid, ctlspec);
	if (phsegdir)
	    write_phseg (phsegdir, phseg, uttid, ctlspec);
	if (wdsegdir)
	    write_wdseg (wdsegdir, wdseg, uttid, ctlspec);
	if (outsentfp)
	    write_outsent (outsentfp, wdseg, uttid);
    }
    
    align_destroy_sent_hmm ();
    
    /* NOTE(review): nfr*0.01 and the *100.0/nfr factors below presumably
       assume 100 frames per second -- confirm.  With nfr == 0 the xRT
       figures divide by zero (floating point). */
    cyctimer_print_all_norm (stdout, nfr*0.01, tmr_utt);
    counter_print_all (stdout);

    printf("EXECTIME: %5d frames, %7.2f sec CPU, %6.2f xRT; %7.2f sec elapsed, %6.2f xRT\n",
	   nfr,
	   tm_utt->t_cpu, tm_utt->t_cpu * 100.0 / nfr,
	   tm_utt->t_elapsed, tm_utt->t_elapsed * 100.0 / nfr);

    /* Accumulate into the running frame count kept outside this function */
    tot_nfr += nfr;
}
/*
 * Score senones for one frame of features and normalize the scores.
 *
 * mg               model handle; used as an ms_mgau_model_t
 * senscr           output scores, indexed by senone id
 * senone_active    active senone ids, delta-coded: each entry is the
 *                  difference from the previous id (the first is relative to 0)
 * n_senone_active  number of entries in senone_active
 * feat             feature vector handed to gauden_dist
 * frame            not used by this function
 * compallsen       non-zero: score every senone and ignore senone_active
 *
 * The smallest raw score found is subtracted from every score that was
 * computed, and each result is clamped to [-32768, 32767] before being stored.
 * Senones that were not evaluated keep whatever senscr held on entry.
 *
 * Always returns 0.
 */
int32
ms_cont_mgau_frame_eval(ps_mgau_t * mg,
			int16 *senscr,
			uint8 *senone_active,
			int32 n_senone_active,
                        mfcc_t ** feat,
			int32 frame,
			int32 compallsen)
{
    ms_mgau_model_t *model = (ms_mgau_model_t *)mg;
    int32 topn = ms_mgau_topn(model);
    gauden_t *g = ms_mgau_gauden(model);
    senone_t *sen = ms_mgau_senone(model);
    int32 lowest = (int32) 0x7fffffff;
    int32 cb, idx, sid, shifted;

    if (compallsen) {
	/* Every codebook is needed when every senone is scored */
	for (cb = 0; cb < g->n_mgau; cb++)
	    gauden_dist(g, cb, topn, feat, model->dist[cb]);

	for (sid = 0; sid < (int) sen->n_sen; sid++) {
	    senscr[sid] = senone_eval(sen, sid, model->dist[sen->mgau[sid]], topn);
	    if (senscr[sid] < lowest)
		lowest = senscr[sid];
	}

	/* Shift by the minimum and saturate to the int16 range */
	for (sid = 0; sid < (int) sen->n_sen; sid++) {
	    shifted = senscr[sid] - lowest;
	    senscr[sid] = (shifted > 32767) ? 32767
		: (shifted < -32768) ? -32768
		: shifted;
	}

	return 0;
    }

    /* Active-only path: start with no codebook marked ... */
    for (cb = 0; cb < g->n_mgau; cb++)
	model->mgau_active[cb] = 0;

    /* ... then mark the codebook of each active senone.  The running sum of
       the deltas in senone_active yields the senone ids. */
    sid = 0;
    for (idx = 0; idx < n_senone_active; idx++) {
	sid += senone_active[idx];
	model->mgau_active[sen->mgau[sid]] = 1;
    }

    /* Top-N densities for the marked codebooks only */
    for (cb = 0; cb < g->n_mgau; cb++) {
	if (model->mgau_active[cb])
	    gauden_dist(g, cb, topn, feat, model->dist[cb]);
    }

    /* Raw scores for the active senones, tracking the minimum */
    sid = 0;
    for (idx = 0; idx < n_senone_active; idx++) {
	sid += senone_active[idx];
	senscr[sid] = senone_eval(sen, sid, model->dist[sen->mgau[sid]], topn);
	if (senscr[sid] < lowest)
	    lowest = senscr[sid];
    }

    /* Shift by the minimum and saturate to the int16 range */
    sid = 0;
    for (idx = 0; idx < n_senone_active; idx++) {
	sid += senone_active[idx];
	shifted = senscr[sid] - lowest;
	senscr[sid] = (shifted > 32767) ? 32767
	    : (shifted < -32768) ? -32768
	    : shifted;
    }

    return 0;
}
/* Example #3 */
/* 0 */
/*
 * Score the active senones for frame frm and normalize them.
 *
 * If am->mfc is set, the feature vector is computed from am->mfc + frm into
 * am->feat[0]; otherwise am->feat[frm] is used as is.  For every codebook
 * flagged in am->gauden_active and every feature stream, gauden_dist() fills
 * am->dist; when am->dist_valid is non-NULL the densities are pruned to those
 * within am->mgaubeam of the best one.  Scores of senones flagged in
 * am->sen_active are accumulated over the streams into am->senscr, and the
 * best accumulated score is then subtracted from every active senone.
 *
 * Calls E_FATAL if senactive_to_mgauactive() reports 0.
 *
 * Returns the best score, i.e. the value that was subtracted.
 */
int32 acoustic_eval (acoustic_t *am, int32 frm)
{
    senone_t *sen = am->sen;
    gauden_t *gau = am->gau;
    float32 **featvec;
    float32 **cep;
    int32 mg, strm, sid, top, topdist;
    int32 n, kept, ncomp;

    /* Pick up (or compute) the feature vector for this frame */
    if (am->mfc) {
        cep = am->mfc + frm;
        am->fcb->compute_feat(am->fcb, cep, am->feat[0]);
        featvec = am->feat[0];
    } else {
        featvec = am->feat[frm];
    }

    /* Derive the active codebooks from the active senones */
    if (senactive_to_mgauactive(am) == 0)
        E_FATAL("No states active\n");

    /* Scores are summed across feature streams, so start from zero */
    memset (am->senscr, 0, sen->n_sen * sizeof(int32));

    top = MAX_NEG_INT32;
    for (mg = 0; mg < gau->n_mgau; mg++) {
        if (! bitvec_is_set (am->gauden_active, mg))
            continue;

        for (strm = 0; strm < gau->n_feat; strm++) {
            ncomp = gauden_dist (gau, mg, strm, featvec[strm], am->dist);

            if (am->dist_valid) {
                /* Beam-prune the densities: find the best one ... */
                topdist = MAX_NEG_INT32;
                for (n = 0; n < ncomp; n++) {
                    if (am->dist[n] > topdist)
                        topdist = am->dist[n];
                }

                /* ... and keep the indices of those inside the beam */
                kept = 0;
                for (n = 0; n < ncomp; n++) {
                    if (am->dist[n] >= topdist + am->mgaubeam) {
                        am->dist_valid[kept++] = n;
                    }
                }
                am->n_dist_valid = kept;
                ncomp = kept;

                /* Pruning statistics */
                am->tot_dist_valid += kept;
                am->tot_mgau_eval += 1;
            }
#if 1
            /* Add this stream's score to each active senone of the codebook */
            for (n = 0; n < sen->mgau2sen[mg].n_sen; n++) {
                sid = sen->mgau2sen[mg].sen[n];
                if (bitvec_is_set (am->sen_active, sid))
                    am->senscr[sid] += senone_eval (sen, sid, strm, am->dist,
                                                    am->dist_valid, ncomp);
            }
#else
            senone_eval_all (sen, mg, strm, am->dist, am->dist_valid, ncomp, am->senscr);
#endif
        }

        /* All streams done for this codebook: update the running best */
        for (n = 0; n < sen->mgau2sen[mg].n_sen; n++) {
            sid = sen->mgau2sen[mg].sen[n];
            if (bitvec_is_set (am->sen_active, sid) && (am->senscr[sid] > top))
                top = am->senscr[sid];
        }
    }

    /* CD-CI likelihood interpolation is not done here (left for later) */

    /* Normalize: subtract the best score from every active senone */
    for (sid = 0; sid < sen->n_sen; sid++) {
        if (bitvec_is_set (am->sen_active, sid))
            am->senscr[sid] -= top;
    }

    return top;
}