Esempio n. 1
0
int
agg_phn_seg(lexicon_t *lex,
	    acmod_set_t *acmod_set,
	    feat_t *fcb,
	    segdmp_type_t type)
{
    uint16 *seg;
    vector_t *mfcc;
    vector_t **feat;
    int32 n_frame;
    uint32 tick_cnt;

    acmod_id_t *phone;
    uint32 *start;
    uint32 *len;
    uint32 n_phone;
    uint32 s;
    char *btw_mark;

    char *trans;
    char **word;
    uint32 n_word;
    int32 mfc_veclen = cmd_ln_int32("-ceplen");

    uint32 n_stream;
    uint32 *veclen;

    tick_cnt = 0;

    n_stream = feat_dimension1(fcb);
    veclen = feat_stream_lengths(fcb);

    while (corpus_next_utt()) {
	if ((++tick_cnt % 500) == 0) {
	    E_INFOCONT("[%u] ", tick_cnt);
	}

	if (corpus_get_sent(&trans) != S3_SUCCESS) {
	    E_FATAL("Unable to read word transcript for %s\n", corpus_utt_brief_name());
	}

	if (corpus_get_seg(&seg, &n_frame) != S3_SUCCESS) {
	    E_FATAL("Unable to read Viterbi state segmentation for %s\n", corpus_utt_brief_name());
	}
	    
	n_word = str2words(trans, NULL, 0);
	word = ckd_calloc(n_word, sizeof(char*));
	str2words(trans, word, n_word);

	phone = mk_phone_list(&btw_mark, &n_phone, word, n_word, lex);
	start = ckd_calloc(n_phone, sizeof(uint32));
	len = ckd_calloc(n_phone, sizeof(uint32));

	/* check to see whether the word transcript and dictionary entries
	   agree with the state segmentation */
	if (ck_seg(acmod_set, phone, n_phone, seg, n_frame, corpus_utt()) != S3_SUCCESS) {
	    free(trans);	/* alloc'ed using strdup, not ckd_*() */
	    free(seg);	/* alloc'ed using malloc in areadshort(), not ckd_*() */
	    ckd_free(word);
	    ckd_free(phone);
	    
	    E_ERROR("ck_seg failed");

	    continue;
	}

	if (cvt2triphone(acmod_set, phone, btw_mark, n_phone) != S3_SUCCESS) {
	    free(trans);	/* alloc'ed using strdup, not ckd_*() */
	    free(seg);		/* alloc'ed using malloc in areadshort(), not ckd_*() */
	    ckd_free(word);
	    ckd_free(phone);

	    E_ERROR("cvt2triphone failed");
	    
	    continue;
	}

	ckd_free(btw_mark);

	if (mk_seg(acmod_set,
		   seg,
		   n_frame,
		   phone,
		   start,
		   len,
		   n_phone) != S3_SUCCESS) {
	    free(trans);
	    free(seg);
	    ckd_free(word);
	    ckd_free(phone);

	    E_ERROR("mk_seg failed");
	    continue;
	}
	
	if (corpus_provides_mfcc()) {
    	        if (corpus_get_generic_featurevec(&mfcc, &n_frame, mfc_veclen) < 0) {
		      E_FATAL("Can't read input features from %s\n", corpus_utt());
		}
		
		if (n_frame < 9) {
		  E_WARN("utt %s too short\n", corpus_utt());
		  if (mfcc) {
		    ckd_free(mfcc[0]);
		    ckd_free(mfcc);
		    mfcc = NULL;
		  }
		  continue;
		}

		feat = feat_array_alloc(fcb, n_frame + feat_window_size(fcb));
	        feat_s2mfc2feat_live(fcb, mfcc, &n_frame, TRUE, TRUE, feat);

		for (s = 0; s < n_phone; s++) {
		    segdmp_add_feat(phone[s],
				    &feat[start[s]],
				    len[s]);
		}

		feat_array_free(feat);
		free(&mfcc[0][0]);
		ckd_free(mfcc);
	}
	else {
	    E_FATAL("No data type specified\n");
	}

	free(trans);	/* alloc'ed using strdup, not ckd_*() */
	free(seg);	/* alloc'ed using malloc in areadshort(), not ckd_*() */
	ckd_free(word);
	ckd_free(phone);
	ckd_free(start);
	ckd_free(len);
    }

    return 0;
}
int32
mmi_viterbi_update(vector_t **feature,
		   uint32 n_obs,
		   state_t *state_seq,
		   uint32 n_state,
		   model_inventory_t *inv,
		   float64 a_beam,
		   int32 mean_reest,
		   int32 var_reest,
		   float64 arc_gamma,
		   feat_t *fcb)
{
    float64 *scale = NULL;
    float64 **dscale = NULL;
    float64 **active_alpha;
    uint32 **active_astate;
    uint32 **bp;
    uint32 *n_active_astate;
    gauden_t *g;/* Gaussian density parameters and reestimation sums */
    float32 ***mixw;/* all mixing weights */
    float64 ***now_den = NULL;/* Short for den[t] */
    uint32 ***now_den_idx = NULL;/* Short for den_idx[t] */
    uint32 *active_cb;
    uint32 n_active_cb;
    float32 ***denacc = NULL;/* mean/var reestimation accumulators for time t */
    size_t denacc_size;/* Total size of data references in denacc.  Allows
			  for quick clears between time frames */
    uint32 n_lcl_cb;
    uint32 *cb_inv;
    uint32 i, j, q;
    int32 t;
    uint32 n_feat;
    uint32 n_density;
    uint32 n_top;
    int ret;
    uint32 n_cb;

    static float64 *p_op = NULL;
    static float64 *p_ci_op = NULL;
    static float64 **d_term = NULL;
    static float64 **d_term_ci = NULL;

    /* caller must ensure that there is some non-zero amount
       of work to be done here */
    assert(n_obs > 0);
    assert(n_state > 0);

    g = inv->gauden;
    n_feat = gauden_n_feat(g);
    n_density = gauden_n_density(g);
    n_top = gauden_n_top(g);
    n_cb = gauden_n_mgau(g);

    if (p_op == NULL) {
	p_op    = ckd_calloc(n_feat, sizeof(float64));
	p_ci_op = ckd_calloc(n_feat, sizeof(float64));
    }

    if (d_term == NULL) {
	d_term    = (float64 **)ckd_calloc_2d(n_feat, n_top, sizeof(float64));
	d_term_ci = (float64 **)ckd_calloc_2d(n_feat, n_top, sizeof(float64));
    }

    scale = (float64 *)ckd_calloc(n_obs, sizeof(float64));
    dscale = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    n_active_astate = (uint32 *)ckd_calloc(n_obs, sizeof(uint32));
    active_alpha  = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    active_astate = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));
    active_cb = ckd_calloc(2*n_state, sizeof(uint32));
    bp = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));

    /* Run forward algorithm, which has embedded Viterbi. */
    ret = forward(active_alpha, active_astate, n_active_astate, bp,
		  scale, dscale,
		  feature, n_obs, state_seq, n_state,
		  inv, a_beam, NULL, 1);
    
    if (cmd_ln_str("-outphsegdir")) {
	E_FATAL("current MMI implementation don't support -outphsegdir\n");
    }


    if (ret != S3_SUCCESS) {

	/* Some problem with the utterance, release per utterance storage and
	 * forget about adding the utterance accumulators to the global accumulators */

	goto all_done;
    }

    mixw = inv->mixw;

    n_lcl_cb = inv->n_cb_inverse;
    cb_inv = inv->cb_inverse;

    /* Allocate local accumulators for mean, variance reestimation
       sums if necessary */
    gauden_alloc_l_acc(g, n_lcl_cb,
		       mean_reest, var_reest,
		       FALSE);

    n_active_cb = 0;
    now_den = (float64 ***)ckd_calloc_3d(n_lcl_cb,
					 n_feat,
					 n_top,
					 sizeof(float64));
    now_den_idx =  (uint32 ***)ckd_calloc_3d(n_lcl_cb,
					     n_feat,
					     n_top,
					     sizeof(uint32));

    if (mean_reest || var_reest) {
	/* allocate space for the per frame density counts */
	denacc = (float32 ***)ckd_calloc_3d(n_lcl_cb,
					    n_feat,
					    n_density,
					    sizeof(float32));

	/* # of bytes required to store all weighted vectors */
	denacc_size = n_lcl_cb * n_feat * n_density * sizeof(float32);
    }
    else {
	denacc = NULL;
	denacc_size = 0;
    }

    /* Okay now run through the backtrace and accumulate counts. */
    /* Find the non-emitting ending state */
    for (q = 0; q < n_active_astate[n_obs-1]; ++q) {
	if (active_astate[n_obs-1][q] == n_state-1)
	    break;
    }
    if (q == n_active_astate[n_obs-1]) {
	E_ERROR("Failed to align audio to trancript: final state of the search is not reached\n");
	ret = S3_ERROR;
	goto all_done;
    }

    for (t = n_obs-1; t >= 0; --t) {
	uint32 l_cb;
	uint32 l_ci_cb;
	float64 op, p_reest_term;
	uint32 prev;

	j = active_astate[t][q];

	/* Follow any non-emitting states at time t first. */
	while (state_seq[j].mixw == TYING_NON_EMITTING) {
	    prev = active_astate[t][bp[t][q]];
	    q = bp[t][q];
	    j = prev;
	}

	/* Now accumulate statistics for the real state. */
	l_cb = state_seq[j].l_cb;
	l_ci_cb = state_seq[j].l_ci_cb;
	n_active_cb = 0;

	gauden_compute_log(now_den[l_cb],
			   now_den_idx[l_cb],
			   feature[t],
			   g,
			   state_seq[j].cb,
			   NULL);
	active_cb[n_active_cb++] = l_cb;

	if (l_cb != l_ci_cb) {
	    gauden_compute_log(now_den[l_ci_cb],
			       now_den_idx[l_ci_cb],
			       feature[t],
			       g,
			       state_seq[j].ci_cb,
			       NULL);
	    active_cb[n_active_cb++] = l_ci_cb;
	}
	ret = gauden_scale_densities_bwd(now_den, now_den_idx,
					 &dscale[t],
					 active_cb, n_active_cb, g);
	if (ret != S3_SUCCESS)
	    goto all_done;
	
	assert(state_seq[j].mixw != TYING_NON_EMITTING);
	/* Now calculate mixture densities. */
	/* This is the normalizer sum_m c_{jm} p(o_t|\lambda_{jm}) */
	op = gauden_mixture(now_den[l_cb], now_den_idx[l_cb],
			    mixw[state_seq[j].mixw], g);

	/* Make up this bogus value to be consistent with backward.c */
	p_reest_term = 1.0 / op;

	/* Compute the output probability excluding the contribution
	 * of each feature stream.  i.e. p_op[0] is the output
	 * probability excluding feature stream 0 */
	partial_op(p_op,
		   op,
		   now_den[l_cb],
		   now_den_idx[l_cb],
		   mixw[state_seq[j].mixw],
		   n_feat,
		   n_top);

	/* compute the probability of each (of possibly topn) density */
	den_terms(d_term,
		  p_reest_term,
		  p_op,
		  now_den[l_cb],
		  now_den_idx[l_cb],
		  mixw[state_seq[j].mixw],
		  n_feat,
		  n_top);

	if (l_cb != l_ci_cb) {
	    /* For each feature stream f, compute:
	     *     sum_k(mixw[f][k] den[f][k])
	     * and store the results in p_ci_op */
	    partial_ci_op(p_ci_op,
			  now_den[l_ci_cb],
			  now_den_idx[l_ci_cb],
			  mixw[state_seq[j].ci_mixw],
			  n_feat,
			  n_top);

	    /* For each feature stream and density compute the terms:
	     *   w[f][k] den[f][k] / sum_k(w[f][k] den[f][k]) * post_j
	     * and store results in d_term_ci */
	    den_terms_ci(d_term_ci,
			 1.0, /* post_j = 1.0 */
			 p_ci_op,
			 now_den[l_ci_cb],
			 now_den_idx[l_ci_cb],
			 mixw[state_seq[j].ci_mixw],
			 n_feat,
			 n_top);
	}
	    
	/* accumulate the probability for each density in the 
	 * density reestimation accumulators */
	if (mean_reest || var_reest) {
	    accum_den_terms(denacc[l_cb], d_term,
			    now_den_idx[l_cb], n_feat, n_top);
	    if (l_cb != l_ci_cb) {
		accum_den_terms(denacc[l_ci_cb], d_term_ci,
				now_den_idx[l_ci_cb], n_feat, n_top);
	    }
	}
	
	/* Note that there is only one state/frame so this is kind of
	   redundant */
	if (mean_reest || var_reest) {
	    /* Update the mean and variance reestimation accumulators */
	    mmi_accum_gauden(denacc,
			     cb_inv,
			     n_lcl_cb,
			     feature[t],
			     now_den_idx,
			     g,
			     mean_reest,
			     var_reest,
			     arc_gamma,
			     fcb);
	    memset(&denacc[0][0][0], 0, denacc_size);
	}
	
	if (t > 0) { 
	    prev = active_astate[t-1][bp[t][q]];
	    q = bp[t][q];
	    j = prev;
	}
    }

    /* If no error was found, add the resulting utterance reestimation
     * accumulators to the global reestimation accumulators */
    accum_global(inv, state_seq, n_state,
		 FALSE, FALSE, mean_reest, var_reest,
		 FALSE);

 all_done:
    ckd_free((void *)scale);
    for (i = 0; i < n_obs; i++) {
	if (dscale[i])
	    ckd_free((void *)dscale[i]);
    }
    ckd_free((void **)dscale);
    
    ckd_free(n_active_astate);
    for (i = 0; i < n_obs; i++) {
	ckd_free((void *)active_alpha[i]);
	ckd_free((void *)active_astate[i]);
	ckd_free((void *)bp[i]);
    }
    ckd_free((void *)active_alpha);
    ckd_free((void *)active_astate);
    ckd_free((void *)active_cb);
    ckd_free((void **)bp);

    if (denacc)
	ckd_free_3d((void ***)denacc);

    if (now_den)
	ckd_free_3d((void ***)now_den);
    if (now_den_idx)
	ckd_free_3d((void ***)now_den_idx);

    if (ret != S3_SUCCESS)
	E_ERROR("viterbi update error in sentence %s\n", corpus_utt_brief_name());

    return ret;
}
int32
viterbi_update(float64 *log_forw_prob,
	       vector_t **feature,
	       uint32 n_obs,
	       state_t *state_seq,
	       uint32 n_state,
	       model_inventory_t *inv,
	       float64 a_beam,
	       float32 spthresh,
	       s3phseg_t *phseg,
	       int32 mixw_reest,
	       int32 tmat_reest,
	       int32 mean_reest,
	       int32 var_reest,
	       int32 pass2var,
	       int32 var_is_full,
	       FILE *pdumpfh,
	       feat_t *fcb)
{
    float64 *scale = NULL;
    float64 **dscale = NULL;
    float64 **active_alpha;
    uint32 **active_astate;
    uint32 **bp;
    uint32 *n_active_astate;
    gauden_t *g;		/* Gaussian density parameters and
				   reestimation sums */
    float32 ***mixw;		/* all mixing weights */
    float64 ***now_den = NULL;	/* Short for den[t] */
    uint32 ***now_den_idx = NULL;/* Short for den_idx[t] */
    uint32 *active_cb;
    uint32 n_active_cb;
    float32 **tacc;		/* Transition matrix reestimation sum accumulators
				   for the utterance. */
    float32 ***wacc;		/* mixing weight reestimation sum accumulators
				   for the utterance. */
    float32 ***denacc = NULL;	/* mean/var reestimation accumulators for time t */
    size_t denacc_size;		/* Total size of data references in denacc.  Allows
				   for quick clears between time frames */
    uint32 n_lcl_cb;
    uint32 *cb_inv;
    uint32 i, j, q;
    int32 t;
    uint32 n_feat;
    uint32 n_density;
    uint32 n_top;
    int ret;
    timing_t *fwd_timer = NULL;
    timing_t *rstu_timer = NULL;
    timing_t *gau_timer = NULL;
    timing_t *rsts_timer = NULL;
    timing_t *rstf_timer = NULL;
    float64 log_fp;	/* accumulator for the log of the probability
			 * of observing the input given the model */
    uint32 max_n_next = 0;
    uint32 n_cb;

    static float64 *p_op = NULL;
    static float64 *p_ci_op = NULL;
    static float64 **d_term = NULL;
    static float64 **d_term_ci = NULL;

    /* caller must ensure that there is some non-zero amount
       of work to be done here */
    assert(n_obs > 0);
    assert(n_state > 0);

    /* Get the forward estimation CPU timer */
    fwd_timer = timing_get("fwd");
    /* Get the per utterance reestimation CPU timer */
    rstu_timer = timing_get("rstu");
    /* Get the Gaussian density evaluation CPU timer */
    gau_timer = timing_get("gau");
    /* Get the per state reestimation CPU timer */
    rsts_timer = timing_get("rsts");
    /* Get the per frame reestimation CPU timer */
    rstf_timer = timing_get("rstf");

    g = inv->gauden;
    n_feat = gauden_n_feat(g);
    n_density = gauden_n_density(g);
    n_top = gauden_n_top(g);
    n_cb = gauden_n_mgau(g);

    if (p_op == NULL) {
	p_op    = ckd_calloc(n_feat, sizeof(float64));
	p_ci_op = ckd_calloc(n_feat, sizeof(float64));
    }

    if (d_term == NULL) {
	d_term    = (float64 **)ckd_calloc_2d(n_feat, n_top, sizeof(float64));
	d_term_ci = (float64 **)ckd_calloc_2d(n_feat, n_top, sizeof(float64));
    }

    scale = (float64 *)ckd_calloc(n_obs, sizeof(float64));
    dscale = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    n_active_astate = (uint32 *)ckd_calloc(n_obs, sizeof(uint32));
    active_alpha  = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    active_astate = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));
    active_cb = ckd_calloc(2*n_state, sizeof(uint32));
    bp = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));

    /* Run forward algorithm, which has embedded Viterbi. */
    if (fwd_timer)
	timing_start(fwd_timer);
    ret = forward(active_alpha, active_astate, n_active_astate, bp,
		  scale, dscale,
		  feature, n_obs, state_seq, n_state,
		  inv, a_beam, phseg, 0);
    /* Dump a phoneme segmentation if requested */
    if (cmd_ln_str("-outphsegdir")) {
	    const char *phsegdir;
	    char *segfn, *uttid;

	    phsegdir = cmd_ln_str("-outphsegdir");
	    uttid = (cmd_ln_int32("-outputfullpath")
		     ? corpus_utt_full_name() : corpus_utt());
	    segfn = ckd_calloc(strlen(phsegdir) + 1
			       + strlen(uttid)
			       + strlen(".phseg") + 1, 1);
	    strcpy(segfn, phsegdir);
	    strcat(segfn, "/");
	    strcat(segfn, uttid);
	    strcat(segfn, ".phseg");
	    write_phseg(segfn, inv, state_seq, active_astate, n_active_astate,
			n_state, n_obs, active_alpha, scale, bp);
	    ckd_free(segfn);
    }
    if (fwd_timer)
	timing_stop(fwd_timer);


    if (ret != S3_SUCCESS) {

	/* Some problem with the utterance, release per utterance storage and
	 * forget about adding the utterance accumulators to the global accumulators */

	goto all_done;
    }

    mixw = inv->mixw;

    if (mixw_reest) {
	/* Need to reallocate mixing accumulators for utt */
	if (inv->l_mixw_acc) {
	    ckd_free_3d((void ***)inv->l_mixw_acc);
	    inv->l_mixw_acc = NULL;
	}
	inv->l_mixw_acc = (float32 ***)ckd_calloc_3d(inv->n_mixw_inverse,
						     n_feat,
						     n_density,
						     sizeof(float32));
    }
    wacc = inv->l_mixw_acc;
    n_lcl_cb = inv->n_cb_inverse;
    cb_inv = inv->cb_inverse;

    /* Allocate local accumulators for mean, variance reestimation
       sums if necessary */
    gauden_alloc_l_acc(g, n_lcl_cb,
		       mean_reest, var_reest,
		       var_is_full);

    if (tmat_reest) {
	if (inv->l_tmat_acc) {
	    ckd_free_2d((void **)inv->l_tmat_acc);
	    inv->l_tmat_acc = NULL;
	}
	for (i = 0; i < n_state; i++) {
	    if (state_seq[i].n_next > max_n_next)
		max_n_next = state_seq[i].n_next;
	}
	inv->l_tmat_acc = (float32 **)ckd_calloc_2d(n_state,
						    max_n_next,
						    sizeof(float32));
    }
    /* transition matrix reestimation sum accumulators
       for the utterance */
    tacc = inv->l_tmat_acc;

    n_active_cb = 0;
    now_den = (float64 ***)ckd_calloc_3d(n_lcl_cb,
					 n_feat,
					 n_top,
					 sizeof(float64));
    now_den_idx =  (uint32 ***)ckd_calloc_3d(n_lcl_cb,
					     n_feat,
					     n_top,
					     sizeof(uint32));

    if (mean_reest || var_reest) {
	/* allocate space for the per frame density counts */
	denacc = (float32 ***)ckd_calloc_3d(n_lcl_cb,
					    n_feat,
					    n_density,
					    sizeof(float32));

	/* # of bytes required to store all weighted vectors */
	denacc_size = n_lcl_cb * n_feat * n_density * sizeof(float32);
    }
    else {
	denacc = NULL;
	denacc_size = 0;
    }

    /* Okay now run through the backtrace and accumulate counts. */
    /* Find the non-emitting ending state */
    for (q = 0; q < n_active_astate[n_obs-1]; ++q) {
	if (active_astate[n_obs-1][q] == n_state-1)
	    break;
    }
    if (q == n_active_astate[n_obs-1]) {
	E_ERROR("Failed to align audio to trancript: final state of the search is not reached\n");
	ret = S3_ERROR;
	goto all_done;
    }

    for (t = n_obs-1; t >= 0; --t) {
	uint32 l_cb;
	uint32 l_ci_cb;
	float64 op, p_reest_term;
	uint32 prev;

	j = active_astate[t][q];

	/* Follow any non-emitting states at time t first. */
	while (state_seq[j].mixw == TYING_NON_EMITTING) {
	    prev = active_astate[t][bp[t][q]];

#if VITERBI_DEBUG
	    printf("Following non-emitting state at time %d, %u => %u\n",
		   t, j, prev);
#endif
	    /* Backtrace and accumulate transition counts. */
	    if (tmat_reest) {
		assert(tacc != NULL);
		tacc[prev][j - prev] += 1.0;
	    }
	    q = bp[t][q];
	    j = prev;
	}

	/* Now accumulate statistics for the real state. */
	l_cb = state_seq[j].l_cb;
	l_ci_cb = state_seq[j].l_ci_cb;
	n_active_cb = 0;

	if (gau_timer)
	    timing_start(gau_timer);

	gauden_compute_log(now_den[l_cb],
			   now_den_idx[l_cb],
			   feature[t],
			   g,
			   state_seq[j].cb,
			   NULL);
	active_cb[n_active_cb++] = l_cb;

	if (l_cb != l_ci_cb) {
	    gauden_compute_log(now_den[l_ci_cb],
			       now_den_idx[l_ci_cb],
			       feature[t],
			       g,
			       state_seq[j].ci_cb,
			       NULL);
	    active_cb[n_active_cb++] = l_ci_cb;
	}
	gauden_scale_densities_bwd(now_den, now_den_idx,
				   &dscale[t],
				   active_cb, n_active_cb, g);

	assert(state_seq[j].mixw != TYING_NON_EMITTING);
	/* Now calculate mixture densities. */
	/* This is the normalizer sum_m c_{jm} p(o_t|\lambda_{jm}) */
	op = gauden_mixture(now_den[l_cb], now_den_idx[l_cb],
			    mixw[state_seq[j].mixw], g);
	if (gau_timer)
	    timing_stop(gau_timer);

	if (rsts_timer)
	    timing_start(rsts_timer);
	/* Make up this bogus value to be consistent with backward.c */
	p_reest_term = 1.0 / op;

	/* Compute the output probability excluding the contribution
	 * of each feature stream.  i.e. p_op[0] is the output
	 * probability excluding feature stream 0 */
	partial_op(p_op,
		   op,
		   now_den[l_cb],
		   now_den_idx[l_cb],
		   mixw[state_seq[j].mixw],
		   n_feat,
		   n_top);

	/* compute the probability of each (of possibly topn) density */
	den_terms(d_term,
		  p_reest_term,
		  p_op,
		  now_den[l_cb],
		  now_den_idx[l_cb],
		  mixw[state_seq[j].mixw],
		  n_feat,
		  n_top);

	if (l_cb != l_ci_cb) {
	    /* For each feature stream f, compute:
	     *     sum_k(mixw[f][k] den[f][k])
	     * and store the results in p_ci_op */
	    partial_ci_op(p_ci_op,
			  now_den[l_ci_cb],
			  now_den_idx[l_ci_cb],
			  mixw[state_seq[j].ci_mixw],
			  n_feat,
			  n_top);

	    /* For each feature stream and density compute the terms:
	     *   w[f][k] den[f][k] / sum_k(w[f][k] den[f][k]) * post_j
	     * and store results in d_term_ci */
	    den_terms_ci(d_term_ci,
			 1.0, /* post_j = 1.0 */
			 p_ci_op,
			 now_den[l_ci_cb],
			 now_den_idx[l_ci_cb],
			 mixw[state_seq[j].ci_mixw],
			 n_feat,
			 n_top);
	}
		    

	/* accumulate the probability for each density in the mixing
	 * weight reestimation accumulators */
	if (mixw_reest) {
	    accum_den_terms(wacc[state_seq[j].l_mixw], d_term,
			    now_den_idx[l_cb], n_feat, n_top);

	    /* check if mixw and ci_mixw are different to avoid
	     * doubling the EM counts in a CI run. */
	    if (state_seq[j].mixw != state_seq[j].ci_mixw) {
                if (n_cb < inv->n_mixw) {
                    /* semi-continuous, tied mixture, and discrete case */
		    accum_den_terms(wacc[state_seq[j].l_ci_mixw], d_term,
				    now_den_idx[l_cb], n_feat, n_top);
		}
		else {
		    /* continuous case */
		    accum_den_terms(wacc[state_seq[j].l_ci_mixw], d_term_ci,
				    now_den_idx[l_ci_cb], n_feat, n_top);
		}
	    }
	}
		    
	/* accumulate the probability for each density in the 
	 * density reestimation accumulators */
	if (mean_reest || var_reest) {
	    accum_den_terms(denacc[l_cb], d_term,
			    now_den_idx[l_cb], n_feat, n_top);
	    if (l_cb != l_ci_cb) {
		accum_den_terms(denacc[l_ci_cb], d_term_ci,
				now_den_idx[l_ci_cb], n_feat, n_top);
	    }
	}
		
	if (rsts_timer)
	    timing_stop(rsts_timer);
	/* Note that there is only one state/frame so this is kind of
	   redundant */
 	if (rstf_timer)
	    timing_start(rstf_timer);
	if (mean_reest || var_reest) {
	    /* Update the mean and variance reestimation accumulators */
	    if (pdumpfh)
		fprintf(pdumpfh, "time %d:\n", t);
	    accum_gauden(denacc,
			 cb_inv,
			 n_lcl_cb,
			 feature[t],
			 now_den_idx,
			 g,
			 mean_reest,
			 var_reest,
			 pass2var,
			 inv->l_mixw_acc,
			 var_is_full,
			 pdumpfh,
			 fcb);
	    memset(&denacc[0][0][0], 0, denacc_size);
	}
	if (rstf_timer)
	    timing_stop(rstf_timer);

	if (t > 0) { 
	    prev = active_astate[t-1][bp[t][q]];
#if VITERBI_DEBUG
	    printf("Backtrace at time %d, %u => %u\n",
		   t, j, prev);
#endif
	    /* Backtrace and accumulate transition counts. */
	    if (tmat_reest) {
		assert(tacc != NULL);
		tacc[prev][j-prev] += 1.0;
	    }
	    q = bp[t][q];
	    j = prev;
	}
    }

    /* If no error was found, add the resulting utterance reestimation
     * accumulators to the global reestimation accumulators */
    if (rstu_timer)
	timing_start(rstu_timer);
    accum_global(inv, state_seq, n_state,
		 mixw_reest, tmat_reest, mean_reest, var_reest,
		 var_is_full);
    if (rstu_timer)
	timing_stop(rstu_timer);

    /* Find the final state */
    for (i = 0; i < n_active_astate[n_obs-1]; ++i) {
	if (active_astate[n_obs-1][i] == n_state-1)
	    break;
    }
    /* Calculate log[ p( O | \lambda ) ] */
    assert(active_alpha[n_obs-1][i] > 0);
    log_fp = log(active_alpha[n_obs-1][i]);
    for (t = 0; t < n_obs; t++) {
	assert(scale[t] > 0);
	log_fp -= log(scale[t]);
	for (j = 0; j < inv->gauden->n_feat; j++) {
	    log_fp += dscale[t][j];
	}
    }

    *log_forw_prob = log_fp;

 all_done:
    ckd_free((void *)scale);
    for (i = 0; i < n_obs; i++) {
	if (dscale[i])
	    ckd_free((void *)dscale[i]);
    }
    ckd_free((void **)dscale);
    
    ckd_free(n_active_astate);
    for (i = 0; i < n_obs; i++) {
	ckd_free((void *)active_alpha[i]);
	ckd_free((void *)active_astate[i]);
	ckd_free((void *)bp[i]);
    }
    ckd_free((void *)active_alpha);
    ckd_free((void *)active_astate);
    ckd_free((void *)active_cb);

    if (denacc)
	ckd_free_3d((void ***)denacc);

    if (now_den)
	ckd_free_3d((void ***)now_den);
    if (now_den_idx)
	ckd_free_3d((void ***)now_den_idx);

    if (ret != S3_SUCCESS)
	E_ERROR("%s ignored\n", corpus_utt_brief_name());

    return ret;
}
/* the following functions are used for MMIE training
   lqin 2010-03 */
int32
mmi_viterbi_run(float64 *log_forw_prob,
		vector_t **feature,
		uint32 n_obs,
		state_t *state_seq,
		uint32 n_state,
		model_inventory_t *inv,
		float64 a_beam)
{
    float64 *scale = NULL;
    float64 **dscale = NULL;
    float64 **active_alpha;
    uint32 **active_astate;
    uint32 **bp;
    uint32 *n_active_astate;
    uint32 *active_cb;
    uint32 i;
    int ret;
    int final_state_error = 0;
    float64 log_fp;/* accumulator for the log of the probability
		    * of observing the input given the model */
    
    /* caller must ensure that there is some non-zero amount
       of work to be done here */
    assert(n_obs > 0);
    assert(n_state > 0);
    
    scale = (float64 *)ckd_calloc(n_obs, sizeof(float64));
    dscale = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    n_active_astate = (uint32 *)ckd_calloc(n_obs, sizeof(uint32));
    active_alpha  = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    active_astate = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));
    active_cb = ckd_calloc(2*n_state, sizeof(uint32));
    bp = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));

    /* Run forward algorithm, which has embedded Viterbi. */
    ret = forward(active_alpha, active_astate, n_active_astate, bp,
		  scale, dscale,
		  feature, n_obs, state_seq, n_state,
		  inv, a_beam, NULL, 1);

    if (ret != S3_SUCCESS) {

	/* Some problem with the utterance, release per utterance storage and
	 * forget about adding the utterance accumulators to the global accumulators */
	
	goto all_done;
    }

    /* Find the non-emitting ending state */
    for (i = 0; i < n_active_astate[n_obs-1]; ++i) {
	if (active_astate[n_obs-1][i] == n_state-1)
	    break;
    }
    if (i == n_active_astate[n_obs-1]) {
	/* since there are so many such errors during the mmie training,
	      it's very annoying to output this error message
	      E_ERROR("Failed to align audio to trancript: final state of the search is not reached\n"); */
	ret = S3_ERROR;
	final_state_error = 1;
	goto all_done;
    }

    /* Calculate log[ p( O | \lambda ) ] */
    assert(active_alpha[n_obs-1][i] > 0);
    log_fp = log(active_alpha[n_obs-1][i]);

    *log_forw_prob = log_fp;

 all_done:
    ckd_free((void *)scale);
    for (i = 0; i < n_obs; i++) {
	if (dscale[i])
	    ckd_free((void *)dscale[i]);
    }
    ckd_free((void **)dscale);
    
    ckd_free(n_active_astate);
    for (i = 0; i < n_obs; i++) {
	ckd_free((void *)active_alpha[i]);
	ckd_free((void *)active_astate[i]);
	ckd_free((void *)bp[i]);
    }
    ckd_free((void *)active_alpha);
    ckd_free((void *)active_astate);
    ckd_free((void *)active_cb);
    ckd_free((void **)bp);

    if (ret != S3_SUCCESS && !final_state_error)
	E_ERROR("viterbi run error in sentence %s\n", corpus_utt_brief_name());
    
    return ret;
}
Esempio n. 5
0
int32
baum_welch_update(float64 *log_forw_prob,
                  vector_t **feature,
                  uint32 n_obs,
                  state_t *state,
                  uint32 n_state,
                  model_inventory_t *inv,
                  float64 a_beam,
                  float64 b_beam,
                  float32 spthresh,
                  s3phseg_t *phseg,
                  int32 mixw_reest,
                  int32 tmat_reest,
                  int32 mean_reest,
                  int32 var_reest,
                  int32 pass2var,
                  int32 var_is_full,
                  FILE *pdumpfh,
                  bw_timers_t *timers,
                  feat_t *fcb)
{
    float64 *scale = NULL;
    float64 **dscale = NULL;
    float64 **active_alpha;
    uint32 **active_astate;
    uint32 **bp;
    uint32 *n_active_astate;
    float64 log_fp;	/* accumulator for the log of the probability
			 * of observing the input given the model */
    uint32 t;		/* time */
    int ret;
    uint32 i,j;

    /* caller must ensure that there is some non-zero amount
       of work to be done here */
    assert(n_obs > 0);
    assert(n_state > 0);

    scale = (float64 *)ckd_calloc(n_obs, sizeof(float64));
    dscale = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    n_active_astate = (uint32 *)ckd_calloc(n_obs, sizeof(uint32));
    active_alpha  = (float64 **)ckd_calloc(n_obs, sizeof(float64 *));
    active_astate = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));
    bp = (uint32 **)ckd_calloc(n_obs, sizeof(uint32 *));

    /* Compute the scaled alpha variable and scale factors
     * for all states and time subject to the pruning constraints */
    if (timers)
        ptmr_start(&timers->fwd_timer);

    /*
     * Debug?
     *   E_INFO("Before Forward search\n");
     */
    ret = forward(active_alpha, active_astate, n_active_astate, bp,
                  scale, dscale,
                  feature, n_obs, state, n_state,
                  inv, a_beam, phseg, timers, 0);

#if BW_DEBUG
    for (i=0 ; i < n_obs; i++) {
        E_INFO("Number of active states %d at time %d\n",n_active_astate[i],i);
        E_INFO("Scale of time %d is %e \n",i,scale[i]);
        for(j=0 ; j < n_active_astate[i]; j++) {
            E_INFO("Active state: %d Active alpha: %e\n",active_astate[i][j], active_alpha[i][j]);
        }
    }
    i=0;
    j=0;
#endif

    /* Dump a phoneme segmentation if requested */
    if (cmd_ln_str("-outphsegdir")) {
        const char *phsegdir;
        char *segfn, *uttid;

        phsegdir = cmd_ln_str("-outphsegdir");
        uttid = (cmd_ln_int32("-outputfullpath")
                 ? corpus_utt_full_name() : corpus_utt());
        segfn = ckd_calloc(strlen(phsegdir) + 1
                           + strlen(uttid)
                           + strlen(".phseg") + 1, 1);
        strcpy(segfn, phsegdir);
        strcat(segfn, "/");
        strcat(segfn, uttid);
        strcat(segfn, ".phseg");
        write_phseg(segfn, inv, state, active_astate, n_active_astate,
                    n_state, n_obs, active_alpha, scale, bp);
        ckd_free(segfn);
    }

    if (timers)
        ptmr_stop(&timers->fwd_timer);

    if (ret != S3_SUCCESS) {

        /* Some problem with the utterance, release per utterance storage and
         * forget about adding the utterance accumulators to the global accumulators */

        goto error;
    }

    /* Compute the scaled beta variable and update the reestimation
     * sums */
    if (timers)
        ptmr_start(&timers->bwd_timer);

#if BW_DEBUG
    E_INFO("Before Backward search\n");
#endif

    ret = backward_update(active_alpha, active_astate, n_active_astate, scale, dscale,
                          feature, n_obs,
                          state, n_state,
                          inv, b_beam, spthresh,
                          mixw_reest, tmat_reest, mean_reest, var_reest, pass2var,
                          var_is_full, pdumpfh, timers, fcb);
    if (timers)
        ptmr_stop(&timers->bwd_timer);

    if (ret != S3_SUCCESS) {

        /* Some problem with the utterance, release per utterance storage and
         * forget about adding the utterance accumulators to the global accumulators */

        goto error;
    }

#if BW_DEBUG
    E_INFO("Before Global Accumulation\n");
#endif

    /* If no error was found in the forward or backward procedures,
     * add the resulting utterance reestimation accumulators to the
     * global reestimation accumulators */
    if (timers)
        ptmr_start(&timers->rstu_timer);
    accum_global(inv, state, n_state,
                 mixw_reest, tmat_reest, mean_reest, var_reest,
                 var_is_full);
    if (timers)
        ptmr_stop(&timers->rstu_timer);

    for (i = 0; i < n_active_astate[n_obs-1] && active_astate[n_obs-1][i] != (n_state-1); i++);

    assert(i < n_active_astate[n_obs-1]);

    /* Calculate log[ p( O | \lambda ) ] */
    assert(active_alpha[n_obs-1][i] > 0);
    log_fp = log(active_alpha[n_obs-1][i]);
    for (t = 0; t < n_obs; t++) {
        assert(scale[t] > 0);
        log_fp -= log(scale[t]);
        for (j = 0; j < inv->gauden->n_feat; j++) {
            log_fp += dscale[t][j];
        }
    }

    *log_forw_prob = log_fp;

    ckd_free((void *)scale);
    ckd_free(n_active_astate);
    for (i = 0; i < n_obs; i++) {
        ckd_free((void *)active_alpha[i]);
        ckd_free((void *)active_astate[i]);
        ckd_free((void *)dscale[i]);
        ckd_free((void *)bp[i]);
    }
    ckd_free((void *)active_alpha);
    ckd_free((void *)active_astate);
    ckd_free((void **)dscale);
    ckd_free(bp);

    return S3_SUCCESS;

error:
    ckd_free((void *)scale);
    for (i = 0; i < n_obs; i++) {
        if (dscale[i])
            ckd_free((void *)dscale[i]);
    }
    ckd_free((void **)dscale);

    ckd_free(n_active_astate);
    for (i = 0; i < n_obs; i++) {
        ckd_free((void *)active_alpha[i]);
        ckd_free((void *)active_astate[i]);
        ckd_free((void *)bp[i]);
    }
    ckd_free((void *)active_alpha);
    ckd_free((void *)active_astate);
    ckd_free(bp);

    E_ERROR("%s ignored\n", corpus_utt_brief_name());

    return S3_ERROR;
}