/*
 * Apply automatic gain control to a block of cepstral frames.
 *
 * fcb       Feature control block; supplies the configured AGC type (fcb->agc)
 *           and the AGC state object (fcb->agc_struct).
 * mfc       Frames to process; handed unchanged to the AGC routine.
 * nfr       Number of frames in mfc.
 * beginutt  Non-zero if this block starts an utterance.
 * endutt    Non-zero if this block ends an utterance.
 *
 * Unless the block is a complete utterance (both flags set), any configured
 * AGC type other than AGC_NONE is replaced by AGC_EMAX.  In AGC_EMAX mode
 * agc_emax_update() is additionally called when endutt is set.  The frames
 * are always passed to cep_dump_dbg() afterwards, whether or not AGC ran.
 */
static void
feat_agc(feat_t *fcb, mfcc_t **mfc, int32 nfr, int32 beginutt, int32 endutt)
{
    const int whole_utt = (beginutt != 0) && (endutt != 0);
    agc_type_t mode = fcb->agc;

    /* Only agc_emax in block computation mode. */
    if (mode != AGC_NONE && !whole_utt)
        mode = AGC_EMAX;

    if (mode == AGC_MAX) {
        agc_max(fcb->agc_struct, mfc, nfr);
    }
    else if (mode == AGC_EMAX) {
        agc_emax(fcb->agc_struct, mfc, nfr);
        if (endutt)
            agc_emax_update(fcb->agc_struct);
    }
    else if (mode == AGC_NOISE) {
        agc_noise(fcb->agc_struct, mfc, nfr);
    }
    /* Any other type (including AGC_NONE): nothing to apply. */

    cep_dump_dbg(fcb, mfc, nfr, "After AGC");
}
/*
 * Apply the automatic gain control method selected by the "-agc"
 * command-line argument to a buffer of cepstral frames.
 *
 * mfcc     Frame buffer; frame i starts at mfcc[i*veclen], where veclen is
 *          defined outside this function.
 * n_frame  Number of frames in mfcc.
 *
 * Recognised values of "-agc": "noise", "max", "emax" and "none".  When the
 * argument is not set (NULL) a warning is logged and the buffer is left
 * untouched.  Any other value is reported through E_FATAL.
 */
void
agc(float32 *mfcc, uint32 n_frame)
{
    const char *agc_type = cmd_ln_access("-agc");
    uint32 i;

    /*
     * This check must come before any strcmp(): passing a null pointer to
     * strcmp() is undefined behaviour, and with the check placed after the
     * comparisons this branch could never be reached.
     */
    if (agc_type == NULL) {
        E_WARN("no agc set\n");
        return ;
    }

    if (strcmp(agc_type, "noise") == 0) {
        real_agc_noise(mfcc, n_frame, veclen);
    }
    else if (strcmp(agc_type, "max") == 0) {
        agc_max(mfcc, n_frame, veclen);
    }
    else if (strcmp(agc_type, "emax") == 0) {
        /* Applied one frame at a time, in place (input == output). */
        for (i = 0; i < n_frame; i++) {
            agc_emax_proc(&mfcc[i*veclen], &mfcc[i*veclen], veclen);
        }
    }
    else if (strcmp(agc_type, "none") == 0) {
        /* do nothing */
    }
    else {
        E_FATAL("unsupported agc type %s\n", agc_type);
    }
}
/*
 * Find Viterbi alignment.
 *
 * Runs one utterance through the aligner: optionally normalizes the input
 * cepstra (CMN/AGC, selected by the "-cmn" and "-agc" arguments), builds the
 * sentence HMM for the reference transcript, then for every input frame
 * computes a feature vector, evaluates the active gaussian codebooks and
 * senones, and advances the alignment by one frame.  On success the resulting
 * state/phone/word segmentations are written to whichever output directories
 * were configured, and timing statistics are printed to stdout.
 *
 * Working buffers and the output directory names are allocated/looked up once,
 * on the first call, and kept in function-level statics for later calls.
 *
 * Besides its parameters the function relies on file-scope objects defined
 * outside this block (g, sen, mdef, interp, senscale, n_feat, featlen,
 * cepsize, outsentfp, the timers and tot_nfr).
 */
static void align_utt (char *sent,      /* In: Reference transcript */
                       float32 **mfc,   /* In: MFC cepstra for input utterance */
                       int32 nfr,       /* In: #frames of input */
                       char *ctlspec,   /* In: Utt specification from control file */
                       char *uttid)     /* In: Utterance id, for logging and other use */
{
    /* Persistent across calls; feat == NULL marks "not yet initialized" */
    static float32 **feat = NULL;
    static int32 w;
    static int32 topn;
    static gauden_dist_t ***dist;
    static int32 *senscr;
    static s3senid_t *sen_active;
    static int8 *mgau_active;
    static char *s2stsegdir;
    static char *stsegdir;
    static char *phsegdir;
    static char *wdsegdir;
    int32 i, s, sid, gid, n_sen_active, best;
    char *arg;
    align_stseg_t *stseg;
    align_phseg_t *phseg;
    align_wdseg_t *wdseg;

    if (! feat) {
        /* One-time allocation of necessary intermediate variables */

        /* Allocate space for a feature vector */
        feat = (float32 **) ckd_calloc (n_feat, sizeof(float32 *));
        for (i = 0; i < n_feat; i++)
            feat[i] = (float32 *) ckd_calloc (featlen[i], sizeof(float32));

        /* Allocate space for top-N codeword density values in a codebook */
        w = feat_window_size ();        /* #MFC vectors needed on either side of current
                                           frame to compute one feature vector; only the
                                           disabled code below refers to it */
        topn = *((int32 *) cmd_ln_access("-topn"));
        if (topn > g->n_density) {
            /* Clamp to the number of densities actually available */
            E_ERROR("-topn argument (%d) > #density codewords (%d); set to latter\n",
                    topn, g->n_density);
            topn = g->n_density;
        }
        dist = (gauden_dist_t ***) ckd_calloc_3d (g->n_mgau, n_feat, topn,
                                                   sizeof(gauden_dist_t));

        /* Space for one frame of senone scores, and per frame active flags */
        senscr = (int32 *) ckd_calloc (sen->n_sen, sizeof(int32));
        sen_active = (s3senid_t *) ckd_calloc (sen->n_sen, sizeof(s3senid_t));
        mgau_active = (int8 *) ckd_calloc (g->n_mgau, sizeof(int8));

        /* Note various output directories (NULL means "do not write that output") */
        s2stsegdir = NULL;
        stsegdir = NULL;
        phsegdir = NULL;
        wdsegdir = NULL;
        if ((arg = (char *) cmd_ln_access ("-s2stsegdir")) != NULL)
            s2stsegdir = (char *) ckd_salloc (arg);
        if ((arg = (char *) cmd_ln_access ("-stsegdir")) != NULL)
            stsegdir = (char *) ckd_salloc (arg);
        if ((arg = (char *) cmd_ln_access ("-phsegdir")) != NULL)
            phsegdir = (char *) ckd_salloc (arg);
        if ((arg = (char *) cmd_ln_access ("-wdsegdir")) != NULL)
            wdsegdir = (char *) ckd_salloc (arg);
    }

    /* Disabled minimum-length check, kept for reference:
       HACK HACKA HACK BHIKSHA
    if (nfr <= (w<<1)) {
        E_ERROR("Utterance %s < %d frames (%d); ignored\n", uttid, (w<<1)+1, nfr);
        return;
    }
       END HACK HACKA HACK */

    /* Restart all per-utterance timers and counters */
    cyctimer_reset_all ();
    counter_reset_all ();
    timing_reset (tm_utt);
    timing_start (tm_utt);
    cyctimer_resume (tmr_utt);

    /* AGC and CMN */
    /* NOTE(review): arg is passed straight to strcmp; this assumes "-cmn" and
       "-agc" always have a value (e.g. a default) -- confirm */
    arg = (char *) cmd_ln_access ("-cmn");
    if (strcmp (arg, "current") == 0)
        norm_mean (mfc-4, nfr+8, cepsize);      /* -4 HACKA HACK */
    /* NOTE(review): the call above starts 4 rows before mfc and covers nfr+8
       rows; presumably the caller pads mfc by 4 frames on each side -- confirm */
    arg = (char *) cmd_ln_access ("-agc");
    if (strcmp (arg, "max") == 0)
        agc_max (mfc, nfr);

    if (align_build_sent_hmm (sent) != 0) {
        /* No sentence HMM for this transcript: clean up and give up.
           Note that tm_utt is not stopped on this path. */
        align_destroy_sent_hmm ();
        cyctimer_pause (tmr_utt);
        E_ERROR("No sentence HMM; no alignment for %s\n", uttid);
        return;
    }

    align_start_utt (uttid);

    /*
     * A feature vector for frame f depends on input MFC vectors [f-w..f+w].  Hence
     * the feature vector corresponding to the first w and last w input frames is
     * undefined.  We define them by simply replicating the first and last true
     * feature vectors (presumably silence regions).
     * (The replication code is currently disabled; see the HACK comment in the loop.)
     */
    for (i = 0; i < nfr; i++) {
        cyctimer_resume (tmr_utt);

        /* Compute feature vector for current frame from input speech cepstra */
        /* Disabled edge-frame replication, kept for reference:
           HACK HACKA HACK BHIKSHA
        if (i < w)
            feat_cep2feat (mfc+w, feat);
        else if (i >= nfr-w)
            feat_cep2feat (mfc+(nfr-w-1), feat);
        else
           END HACK HACKA HACK */
        feat_cep2feat (mfc+i, feat);

        /*
         * Evaluate gaussian density codebooks and senone scores for input codeword.
         * Evaluate only active codebooks and senones.
         */

        /* Obtain active senone flags */
        cyctimer_resume (tmr_senone);
        align_sen_active (sen_active, sen->n_sen);

        /* Flag all CI senones to active if interpolating */
        if (interp) {
            for (s = 0; s < mdef->n_ci_sen; s++)
                sen_active[s] = 1;
        }

        /* Turn active flags into list (for faster access).  Done in place: the
           write index n_sen_active never exceeds the read index s, so no flag
           is overwritten before it has been examined. */
        n_sen_active = 0;
        for (s = 0; s < mdef->n_sen; s++) {
            if (sen_active[s])
                sen_active[n_sen_active++] = s;
        }
        cyctimer_pause (tmr_senone);

        /* Flag all active mixture-gaussian codebooks */
        cyctimer_resume (tmr_gauden);
        for (gid = 0; gid < g->n_mgau; gid++)
            mgau_active[gid] = 0;
        for (s = 0; s < n_sen_active; s++) {
            sid = sen_active[s];
            mgau_active[sen->mgau[sid]] = 1;
        }

        /* Compute topn gaussian density values (for active codebooks) */
        for (gid = 0; gid < g->n_mgau; gid++)
            if (mgau_active[gid])
                gauden_dist (g, gid, topn, feat, dist[gid]);
        cyctimer_pause (tmr_gauden);

        /* Evaluate active senones, tracking the best (largest) score */
        cyctimer_resume (tmr_senone);
        best = (int32) 0x80000000;      /* Start from the lowest 32-bit value */
        for (s = 0; s < n_sen_active; s++) {
            sid = sen_active[s];
            senscr[sid] = senone_eval (sen, sid, dist[sen->mgau[sid]], topn);
            if (best < senscr[sid])
                best = senscr[sid];
        }
        if (interp) {
            /* Interpolate each active CD senone (id >= n_ci_sen) with its CI senone */
            for (s = 0; s < n_sen_active; s++) {
                if ((sid = sen_active[s]) >= mdef->n_ci_sen)
                    interp_cd_ci (interp, senscr, sid, mdef->cd2cisen[sid]);
            }
        }

        /* Normalize senone scores (interpolation above can only lower best score) */
        for (s = 0; s < n_sen_active; s++) {
            sid = sen_active[s];
            senscr[sid] -= best;
        }
        senscale[i] = best;     /* Remember the per-frame offset that was subtracted */
        cyctimer_pause (tmr_senone);

        /* Step alignment one frame forward */
        cyctimer_resume (tmr_align);
        align_frame (senscr);
        cyctimer_pause (tmr_align);

        cyctimer_pause (tmr_utt);
    }
    timing_stop (tm_utt);

    printf ("\n");

    /* Wind up alignment for this utterance */
    if (align_end_utt (&stseg, &phseg, &wdseg) < 0)
        E_ERROR("Final state not reached; no alignment for %s\n\n", uttid);
    else {
        /* Write each segmentation whose output destination was configured */
        if (s2stsegdir)
            write_s2stseg (s2stsegdir, stseg, uttid, ctlspec);
        if (stsegdir)
            write_stseg (stsegdir, stseg, uttid, ctlspec);
        if (phsegdir)
            write_phseg (phsegdir, phseg, uttid, ctlspec);
        if (wdsegdir)
            write_wdseg (wdsegdir, wdseg, uttid, ctlspec);
        if (outsentfp)
            write_outsent (outsentfp, wdseg, uttid);
    }

    align_destroy_sent_hmm ();

    /* Per-utterance statistics.  The xRT figures divide by nfr, and the 100.0
       factor and nfr*0.01 presumably assume 100 frames per second -- confirm. */
    cyctimer_print_all_norm (stdout, nfr*0.01, tmr_utt);
    counter_print_all (stdout);

    printf("EXECTIME: %5d frames, %7.2f sec CPU, %6.2f xRT; %7.2f sec elapsed, %6.2f xRT\n",
           nfr, tm_utt->t_cpu, tm_utt->t_cpu * 100.0 / nfr,
           tm_utt->t_elapsed, tm_utt->t_elapsed * 100.0 / nfr);

    tot_nfr += nfr;     /* Running total of frames processed across utterances */
}
/*
 * Decode a single utterance.
 *
 * data     Knowledge base; cast to kb_t * and used for the acoustic model,
 *          dictionary, LM, Viterbi history and timers.
 * uttfile  Utterance file spec; combined with the "-cepdir" and "-cepext"
 *          arguments to form the cepstrum file name.
 * sf, ef   Passed through to s2mfc_read() (start/end frame selection).
 * uttid    Utterance id, used in log messages and as the base name of the
 *          "<uttid>.lat" lattice file.
 *
 * Reads the cepstra, applies CMN/AGC when "-cmn" is "current" / "-agc" is
 * "max", runs the lextree Viterbi search frame by frame, prints the
 * hypothesis, logs the word lattice (to "<uttid>.lat", or to stdout if that
 * file cannot be named or opened), prints timing/counter summaries and finally
 * releases the per-utterance search structures.
 *
 * Returns early, after logging an error, if the cepstrum read fails or the
 * utterance has fewer than 2*featwin+1 frames.
 */
static void decode_utt (void *data, char *uttfile, int32 sf, int32 ef, char *uttid)
{
    kb_t *kb;
    acoustic_t *am;
    int32 featwin, nfr, min_utt_frames, n_vithist;
    char cepfile[4096], latfile[4096];
    vithist_t *finalhist;
    int32 i, f;
    int latlen;
    glist_t hyplist;
    FILE *latfp;

    printf ("\n");
    fflush (stdout);
    E_INFO("Utterance %s\n", uttid);

    kb = (kb_t *)data;
    am = kb->am;
    featwin = feat_window_size(am->fcb);

    /* Build complete cepfile name and read cepstrum data; check for min length */
    ctl_infile (cepfile, cmd_ln_str("-cepdir"), cmd_ln_str("-cepext"), uttfile);
    if ((nfr = s2mfc_read (cepfile, sf, ef, featwin, am->mfc, S3_MAX_FRAMES)) < 0) {
        E_ERROR("%s: MFC read failed\n", uttid);
        return;
    }
    E_INFO("%s: %d frames\n", uttid, nfr-(featwin<<1));

    ptmr_reset (kb->tm);
    ptmr_reset (kb->tm_search);
    ptmr_start (kb->tm);

    min_utt_frames = (featwin<<1) + 1;
    if (nfr < min_utt_frames) {
        /* The format has exactly two conversions; no surplus arguments are passed. */
        E_ERROR("%s: Utterance shorter than %d frames; ignored\n", uttid, min_utt_frames);
        return;
    }

    /* CMN/AGC */
    if (strcmp (cmd_ln_str("-cmn"), "current") == 0)
        cmn (am->mfc, nfr, feat_cepsize(am->fcb));
    if (strcmp (cmd_ln_str("-agc"), "max") == 0)
        agc_max (am->mfc, nfr);

    /* Process utterance; i indexes input frames, f indexes search frames */
    lextree_vit_start (kb, uttid);
    for (i = featwin, f = 0; i < nfr-featwin; i++, f++) {
        am->senscale[f] = acoustic_eval (am, i);

        ptmr_start (kb->tm_search);
        lextree_vit_frame (kb, f, uttid);
        printf (" %d,%d,%d", f, glist_count (kb->vithist[f]), glist_count (kb->lextree_active));
        fflush (stdout);
        ptmr_stop (kb->tm_search);
    }
    printf ("\n");

    finalhist = lextree_vit_end (kb, f, uttid);

    hyplist = vithist_backtrace (finalhist, kb->am->senscale);
    hyp_log (stdout, hyplist, _dict_wordstr, (void *)kb->dict);
    hyp_myfree (hyplist);
    printf ("\n");

    /*
     * Log the entire Viterbi word lattice.  The file name is formatted with a
     * bounded snprintf so that an over-long uttid cannot overrun latfile; if
     * the name does not fit, or the file cannot be opened, log to stdout.
     */
    latlen = snprintf (latfile, sizeof(latfile), "%s.lat", uttid);
    if (latlen < 0 || (size_t) latlen >= sizeof(latfile)) {
        E_ERROR("%s: lattice file name too long; using stdout\n", uttid);
        latfp = stdout;
    }
    else if ((latfp = fopen(latfile, "w")) == NULL) {
        E_ERROR("fopen(%s,w) failed; using stdout\n", latfile);
        latfp = stdout;
    }
    n_vithist = vithist_log (latfp, kb->vithist, f, _dict_wordstr, (void *)kb->dict);
    if (latfp != stdout)
        fclose (latfp);
    else {
        printf ("\n");
        fflush (stdout);
    }

    ptmr_stop (kb->tm);

    /* Summaries are per frame, so skip them when no frame was processed */
    if (f > 0) {
        printf("TMR(%s): %5d frames; %.1fs CPU, %.2f xRT; %.1fs CPU(search), %.2f xRT; %.1fs Elapsed, %.2f xRT\n",
               uttid, f,
               kb->tm->t_cpu, kb->tm->t_cpu * 100.0 / f,
               kb->tm_search->t_cpu, kb->tm_search->t_cpu * 100.0 / f,
               kb->tm->t_elapsed, kb->tm->t_elapsed * 100.0 / f);
        printf("CTR(%s): %5d frames; %d Sen (%.1f/fr); %d HMM (%.1f/fr); %d Words (%.1f/fr)\n",
               uttid, f,
               kb->n_sen_eval, ((float64)kb->n_sen_eval) / f,
               kb->n_hmm_eval, ((float64)kb->n_hmm_eval) / f,
               n_vithist, ((float64) n_vithist) / f);
    }

    /* Cleanup */
    glist_free (kb->lextree_active);
    kb->lextree_active = NULL;
    for (; f >= -1; --f) {      /* I.e., including dummy START_WORD node at frame -1 */
        glist_myfree (kb->vithist[f], sizeof(vithist_t));
        kb->vithist[f] = NULL;
    }

    lm_cache_reset (kb->lm);
}