/*
 * Find the Viterbi alignment of one utterance against its reference transcript.
 *
 * For every input frame: build the feature vector, collect the active senones,
 * evaluate the codebooks those senones refer to, score and normalize the active
 * senones, and step the alignment forward by one frame.  When align_end_utt
 * succeeds, the resulting segmentations are written to each output destination
 * that has been configured.  Timing and counter statistics are printed at the end.
 */
static void align_utt (char *sent,      /* In: Reference transcript */
                       float32 **mfc,   /* In: MFC cepstra for input utterance */
                       int32 nfr,       /* In: #frames of input */
                       char *ctlspec,   /* In: Utt specification from control file */
                       char *uttid)     /* In: Utterance id, for logging and other use */
{
    /* Working storage: set up on the first call (feat_buf == NULL), reused afterwards */
    static float32 **feat_buf = NULL;
    static int32 win;
    static int32 n_top;
    static gauden_dist_t ***top_dist;
    static int32 *scores;
    static s3senid_t *sen_list;
    static int8 *cb_flag;
    static char *dir_s2stseg;
    static char *dir_stseg;
    static char *dir_phseg;
    static char *dir_wdseg;

    int32 f, fr, k, sid, cb, n_active, best, scr;
    char *opt;
    align_stseg_t *st_out;
    align_phseg_t *ph_out;
    align_wdseg_t *wd_out;

    if (! feat_buf) {
        /* One feature vector: n_feat streams, stream f holding featlen[f] values */
        feat_buf = (float32 **) ckd_calloc (n_feat, sizeof(float32 *));
        for (f = 0; f < n_feat; f++)
            feat_buf[f] = (float32 *) ckd_calloc (featlen[f], sizeof(float32));

        /*
         * #MFC vectors needed on either side of the current frame to compute one
         * feature vector.  Only the disabled edge handling (see below) used it.
         */
        win = feat_window_size ();

        /* Top-N density values per codebook and feature stream; N capped at #densities */
        n_top = *((int32 *) cmd_ln_access("-topn"));
        if (n_top > g->n_density) {
            E_ERROR("-topn argument (%d) > #density codewords (%d); set to latter\n",
                    n_top, g->n_density);
            n_top = g->n_density;
        }
        top_dist = (gauden_dist_t ***) ckd_calloc_3d (g->n_mgau, n_feat, n_top,
                                                      sizeof(gauden_dist_t));

        /* One frame of senone scores, plus per-frame active senone/codebook markers */
        scores = (int32 *) ckd_calloc (sen->n_sen, sizeof(int32));
        sen_list = (s3senid_t *) ckd_calloc (sen->n_sen, sizeof(s3senid_t));
        cb_flag = (int8 *) ckd_calloc (g->n_mgau, sizeof(int8));

        /* Remember a private copy of each output directory that was specified */
        dir_s2stseg = NULL;
        dir_stseg = NULL;
        dir_phseg = NULL;
        dir_wdseg = NULL;

        opt = (char *) cmd_ln_access ("-s2stsegdir");
        if (opt != NULL)
            dir_s2stseg = (char *) ckd_salloc (opt);
        opt = (char *) cmd_ln_access ("-stsegdir");
        if (opt != NULL)
            dir_stseg = (char *) ckd_salloc (opt);
        opt = (char *) cmd_ln_access ("-phsegdir");
        if (opt != NULL)
            dir_phseg = (char *) ckd_salloc (opt);
        opt = (char *) cmd_ln_access ("-wdsegdir");
        if (opt != NULL)
            dir_wdseg = (char *) ckd_salloc (opt);
    }

    /*
     * Disabled in the original (marked as a hack): rejecting utterances with
     * nfr <= 2*win frames.  Utterances of any length are processed.
     */

    cyctimer_reset_all ();
    counter_reset_all ();

    timing_reset (tm_utt);
    timing_start (tm_utt);

    cyctimer_resume (tmr_utt);

    /* CMN, then AGC, each applied only for one specific option value */
    opt = (char *) cmd_ln_access ("-cmn");
    if (strcmp (opt, "current") == 0) {
        /*
         * Marked as a hack in the original: starts 4 vectors before mfc and
         * covers nfr+8 vectors.
         * NOTE(review): presumably the caller pads the buffer on both sides -- confirm.
         */
        norm_mean (mfc-4, nfr+8, cepsize);
    }
    opt = (char *) cmd_ln_access ("-agc");
    if (strcmp (opt, "max") == 0) {
        agc_max (mfc, nfr);
    }

    if (align_build_sent_hmm (sent) != 0) {
        /* NOTE(review): tm_utt was started above and is not stopped on this path */
        align_destroy_sent_hmm ();
        cyctimer_pause (tmr_utt);

        E_ERROR("No sentence HMM; no alignment for %s\n", uttid);

        return;
    }

    align_start_utt (uttid);

    /*
     * Replicating the first/last true feature vectors for the first and last
     * win frames is disabled in the original (marked as a hack); the feature
     * vector of every frame is computed directly from mfc+fr.
     */
    for (fr = 0; fr < nfr; fr++) {
        cyctimer_resume (tmr_utt);

        /* Feature vector for the current frame from the input cepstra */
        feat_cep2feat (mfc+fr, feat_buf);

        /* ---- Active senones ---- */
        cyctimer_resume (tmr_senone);

        /* Per-senone active flags */
        align_sen_active (sen_list, sen->n_sen);

        /* When interpolating, every CI senone is needed as well */
        if (interp) {
            for (k = 0; k < mdef->n_ci_sen; k++)
                sen_list[k] = 1;
        }

        /*
         * Compact the flags, in place, into a list of active senone ids.  The
         * write position never passes the read position, so no unread flag is
         * overwritten.
         */
        n_active = 0;
        for (k = 0; k < mdef->n_sen; k++) {
            if (sen_list[k])
                sen_list[n_active++] = k;
        }
        cyctimer_pause (tmr_senone);

        /* ---- Codebooks needed by the active senones ---- */
        cyctimer_resume (tmr_gauden);
        for (cb = 0; cb < g->n_mgau; cb++)
            cb_flag[cb] = 0;
        for (k = 0; k < n_active; k++) {
            sid = sen_list[k];
            cb_flag[sen->mgau[sid]] = 1;
        }

        /* Top-N density values for the flagged codebooks only */
        for (cb = 0; cb < g->n_mgau; cb++) {
            if (cb_flag[cb])
                gauden_dist (g, cb, n_top, feat_buf, top_dist[cb]);
        }
        cyctimer_pause (tmr_gauden);

        /* ---- Senone scores ---- */
        cyctimer_resume (tmr_senone);
        best = (int32) 0x80000000;
        for (k = 0; k < n_active; k++) {
            sid = sen_list[k];
            scr = senone_eval (sen, sid, top_dist[sen->mgau[sid]], n_top);
            scores[sid] = scr;
            if (scr > best)
                best = scr;
        }

        /* Interpolate each active CD senone (id >= n_ci_sen) with its CI senone */
        if (interp) {
            for (k = 0; k < n_active; k++) {
                sid = sen_list[k];
                if (sid >= mdef->n_ci_sen)
                    interp_cd_ci (interp, scores, sid, mdef->cd2cisen[sid]);
            }
        }

        /*
         * Normalize by the best pre-interpolation score (per the original note,
         * interpolation can only lower the best score) and record that score
         * for this frame.
         */
        for (k = 0; k < n_active; k++) {
            sid = sen_list[k];
            scores[sid] -= best;
        }
        senscale[fr] = best;
        cyctimer_pause (tmr_senone);

        /* ---- Step alignment one frame forward ---- */
        cyctimer_resume (tmr_align);
        align_frame (scores);
        cyctimer_pause (tmr_align);

        cyctimer_pause (tmr_utt);
    }
    timing_stop (tm_utt);

    printf ("\n");

    /* Wind up the alignment; write results only if it completed */
    if (align_end_utt (&st_out, &ph_out, &wd_out) >= 0) {
        if (dir_s2stseg)
            write_s2stseg (dir_s2stseg, st_out, uttid, ctlspec);
        if (dir_stseg)
            write_stseg (dir_stseg, st_out, uttid, ctlspec);
        if (dir_phseg)
            write_phseg (dir_phseg, ph_out, uttid, ctlspec);
        if (dir_wdseg)
            write_wdseg (dir_wdseg, wd_out, uttid, ctlspec);
        if (outsentfp)
            write_outsent (outsentfp, wd_out, uttid);
    } else {
        E_ERROR("Final state not reached; no alignment for %s\n\n", uttid);
    }

    align_destroy_sent_hmm ();

    /* Per-utterance statistics */
    cyctimer_print_all_norm (stdout, nfr*0.01, tmr_utt);
    counter_print_all (stdout);

    printf("EXECTIME: %5d frames, %7.2f sec CPU, %6.2f xRT; %7.2f sec elapsed, %6.2f xRT\n",
           nfr,
           tm_utt->t_cpu, tm_utt->t_cpu * 100.0 / nfr,
           tm_utt->t_elapsed, tm_utt->t_elapsed * 100.0 / nfr);

    tot_nfr += nfr;
}
int32 ms_cont_mgau_frame_eval(ps_mgau_t * mg, int16 *senscr, uint8 *senone_active, int32 n_senone_active, mfcc_t ** feat, int32 frame, int32 compallsen) { ms_mgau_model_t *msg = (ms_mgau_model_t *)mg; int32 gid; int32 topn; int32 best; gauden_t *g; senone_t *sen; topn = ms_mgau_topn(msg); g = ms_mgau_gauden(msg); sen = ms_mgau_senone(msg); if (compallsen) { int32 s; for (gid = 0; gid < g->n_mgau; gid++) gauden_dist(g, gid, topn, feat, msg->dist[gid]); best = (int32) 0x7fffffff; for (s = 0; s <(int) sen->n_sen; s++) { senscr[s] = senone_eval(sen, s, msg->dist[sen->mgau[s]], topn); if (best > senscr[s]) { best = senscr[s]; } } /* Normalize senone scores */ for (s = 0; s <(int) sen->n_sen; s++) { int32 bs = senscr[s] - best; if (bs > 32767) bs = 32767; if (bs < -32768) bs = -32768; senscr[s] = bs; } } else { int32 i, n; /* Flag all active mixture-gaussian codebooks */ for (gid = 0; gid < g->n_mgau; gid++) msg->mgau_active[gid] = 0; n = 0; for (i = 0; i < n_senone_active; i++) { /* senone_active consists of deltas. */ int32 s = senone_active[i] + n; msg->mgau_active[sen->mgau[s]] = 1; n = s; } /* Compute topn gaussian density values (for active codebooks) */ for (gid = 0; gid < g->n_mgau; gid++) { if (msg->mgau_active[gid]) gauden_dist(g, gid, topn, feat, msg->dist[gid]); } best = (int32) 0x7fffffff; n = 0; for (i = 0; i < n_senone_active; i++) { int32 s = senone_active[i] + n; senscr[s] = senone_eval(sen, s, msg->dist[sen->mgau[s]], topn); if (best > senscr[s]) { best = senscr[s]; } n = s; } /* Normalize senone scores */ n = 0; for (i = 0; i < n_senone_active; i++) { int32 s = senone_active[i] + n; int32 bs = senscr[s] - best; if (bs > 32767) bs = 32767; if (bs < -32768) bs = -32768; senscr[s] = bs; n = s; } } return 0; }
/*
 * Evaluate the active senones for frame frm and normalize their scores.
 *
 * If am->mfc is set, the feature vector is computed from am->mfc + frm into
 * am->feat[0]; otherwise am->feat[frm] is used as is.  For each active mixture
 * Gaussian and each feature stream, density values are computed, optionally
 * pruned to those within am->mgaubeam of the best one (only when
 * am->dist_valid is non-NULL), and accumulated into am->senscr for the active
 * senones that use that mixture Gaussian.
 *
 * Fails via E_FATAL when senactive_to_mgauactive reports 0.
 *
 * Returns the largest accumulated score found among the active senones of the
 * evaluated mixture Gaussians; that value has been subtracted from the score
 * of every active senone.
 */
int32 acoustic_eval (acoustic_t *am, int32 frm)
{
    senone_t *senones;
    gauden_t *gauden;
    int32 mg, strm, sid, top_scr, top_den;
    int32 c, n_kept, n_comp;
    float32 **featvec;
    float32 **cep;

    senones = am->sen;
    gauden = am->gau;

    /* Pick (or compute) the feature vector for this frame */
    if (! am->mfc) {
        featvec = am->feat[frm];
    } else {
        cep = am->mfc + frm;
        am->fcb->compute_feat(am->fcb, cep, am->feat[0]);
        featvec = am->feat[0];
    }

    /* Identify the active mixture Gaussians */
    if (senactive_to_mgauactive(am) == 0)
        E_FATAL("No states active\n");

    /* Scores are accumulated across feature streams, so start from zero */
    memset (am->senscr, 0, senones->n_sen * sizeof(int32));

    top_scr = MAX_NEG_INT32;
    for (mg = 0; mg < gauden->n_mgau; mg++) {
        if (! bitvec_is_set (am->gauden_active, mg))
            continue;

        for (strm = 0; strm < gauden->n_feat; strm++) {
            n_comp = gauden_dist (gauden, mg, strm, featvec[strm], am->dist);

            if (am->dist_valid) {
                /* Best density value among the n_comp computed */
                top_den = MAX_NEG_INT32;
                for (c = 0; c < n_comp; c++) {
                    if (am->dist[c] > top_den)
                        top_den = am->dist[c];
                }

                /* Keep the indices of the components inside the pruning beam */
                n_kept = 0;
                for (c = 0; c < n_comp; c++) {
                    if (am->dist[c] >= top_den + am->mgaubeam)
                        am->dist_valid[n_kept++] = c;
                }

                am->n_dist_valid = n_kept;
                n_comp = n_kept;

                /* Running pruning statistics */
                am->tot_dist_valid += n_kept;
                am->tot_mgau_eval += 1;
            }

#if 1
            /* Add this stream's score to each active senone sharing this mgau */
            for (c = 0; c < senones->mgau2sen[mg].n_sen; c++) {
                sid = senones->mgau2sen[mg].sen[c];
                if (bitvec_is_set (am->sen_active, sid))
                    am->senscr[sid] += senone_eval (senones, sid, strm, am->dist,
                                                    am->dist_valid, n_comp);
            }
#else
            senone_eval_all (senones, mg, strm, am->dist, am->dist_valid, n_comp, am->senscr);
#endif
        }

        /* All streams are in: update the best senone score seen so far */
        for (c = 0; c < senones->mgau2sen[mg].n_sen; c++) {
            sid = senones->mgau2sen[mg].sen[c];
            if (bitvec_is_set (am->sen_active, sid) && (am->senscr[sid] > top_scr))
                top_scr = am->senscr[sid];
        }
    }

    /* CD-CI likelihood-interpolation (later) */

    /* Normalize the active senone scores by subtracting the best */
    for (sid = 0; sid < senones->n_sen; sid++) {
        if (bitvec_is_set (am->sen_active, sid))
            am->senscr[sid] -= top_scr;
    }

    return top_scr;
}