float64 best_q(float32 ****mixw, /* ADDITION FOR CONTINUOUS_TREES 21 May 98 */ float32 ****means, float32 ****vars, uint32 *veclen, /* END ADDITION FOR CONTINUOUS_TREES */ uint32 n_model, uint32 n_state, uint32 n_stream, uint32 n_density, float32 *stwt, uint32 **dfeat, uint32 n_dfeat, quest_t *all_q, uint32 n_all_q, pset_t *pset, uint32 *id, uint32 n_id, float32 ***dist, /* ADDITION FOR CONTINUOUS_TREES 21 May 98 */ float64 node_wt_ent, /* Weighted entropy of node */ /* END ADDITION FOR CONTINUOUS_TREES */ quest_t **out_best_q) { float32 ***yes_dist; /* ADDITION FOR CONTINUOUS_TREES */ float32 ***yes_means=0; float32 ***yes_vars=0; float32 varfloor=0; float64 y_ent; /* END ADDITION FOR CONTINUOUS_TREES */ float64 yes_dnom, yes_norm; uint32 *yes_id; float32 ***no_dist; /* ADDITION FOR CONTINUOUS_TREES */ float32 ***no_means=0; float32 ***no_vars=0; float64 n_ent; /* END ADDITION FOR CONTINUOUS_TREES */ float64 no_dnom, no_norm; uint32 *no_id; uint32 n_yes, n_b_yes = 0; uint32 n_no, n_b_no = 0; uint32 i, j, k, q, b_q=0, s; uint32 ii; float64 einc, b_einc = -1.0e+50; /* ADDITION FOR CONTINUOUS_TREES; 20 May 98 */ char* type; uint32 continuous, sumveclen=0; type = (char *)cmd_ln_access("-ts2cbfn"); if (strcmp(type,".semi.")!=0 && strcmp(type,".cont.") != 0) E_FATAL("Type %s unsupported; trees can only be built on types .semi. or .cont.\n",type); if (strcmp(type,".cont.") == 0) continuous = 1; else continuous = 0; if (continuous == 1) { varfloor = *(float32 *)cmd_ln_access("-varfloor"); /* Allocating for sumveclen is overallocation, but it eases coding */ for (ii=0,sumveclen=0; ii<n_stream; ii++) sumveclen += veclen[ii]; yes_means = (float32 ***)ckd_calloc_3d(n_state,n_stream,sumveclen,sizeof(float32)); yes_vars = (float32 ***)ckd_calloc_3d(n_state,n_stream,sumveclen,sizeof(float32)); no_means = (float32 ***)ckd_calloc_3d(n_state,n_stream,sumveclen,sizeof(float32)); no_vars = (float32 ***)ckd_calloc_3d(n_state,n_stream,sumveclen,sizeof(float32)); } /* END ADDITIONS FOR CONTINUOUS_TREES */ n_yes = n_no = 0; yes_dist = (float32 ***)ckd_calloc_3d(n_state, n_stream, n_density, sizeof(float32)); no_dist = (float32 ***)ckd_calloc_3d(n_state, n_stream, n_density, sizeof(float32)); for (q = 0; q < n_all_q; q++) { memset(&yes_dist[0][0][0], 0, sizeof(float32) * n_state * n_stream * n_density); memset(&no_dist[0][0][0], 0, sizeof(float32) * n_state * n_stream * n_density); /* ADDITION FOR CONTINUOUS_TREES; If continuous hmm initialize means and vars to zero */ if (continuous == 1) { memset(&yes_means[0][0][0], 0, sizeof(float32) * n_state * n_stream * sumveclen); memset(&yes_vars[0][0][0], 0, sizeof(float32) * n_state * n_stream * sumveclen); memset(&no_means[0][0][0], 0, sizeof(float32) * n_state * n_stream * sumveclen); memset(&no_vars[0][0][0], 0, sizeof(float32) * n_state * n_stream * sumveclen); } /* END ADDITION FOR CONTINUOUS_TREES */ n_yes = n_no = 0; for (ii = 0; ii < n_id; ii++) { i = id[ii]; if (eval_quest(&all_q[q], dfeat[i], n_dfeat)) { for (s = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { for (k = 0; k < n_density; k++) { yes_dist[s][j][k] += mixw[i][s][j][k]; } } } /* MODIFICATION FOR CONTINUOUS_TREES: ADDITIONS FOR CONTINUOUS CASE */ if (continuous == 1) { for (s = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { for (k = 0; k < veclen[j]; k++) { yes_means[s][j][k] += mixw[i][s][j][0] * means[i][s][j][k]; yes_vars[s][j][k] += mixw[i][s][j][0] * (vars[i][s][j][k] + means[i][s][j][k]*means[i][s][j][k]); } } } } /* END MODIFICATION FOR CONTINUOUS_TREES */ ++n_yes; } else { for (s = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { for (k = 0; k < n_density; k++) { no_dist[s][j][k] += mixw[i][s][j][k]; } } } /* MODIFICATION FOR CONTINUOUS_TREES: ADDITIONS FOR CONTINUOUS CASE */ if (continuous == 1) { for (s = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { for (k = 0; k < veclen[j]; k++) { no_means[s][j][k] += mixw[i][s][j][0] * means[i][s][j][k]; no_vars[s][j][k] += mixw[i][s][j][0] * (vars[i][s][j][k] + means[i][s][j][k]*means[i][s][j][k]); } } } } /* END MODIFICATION FOR CONTINUOUS_TREES */ ++n_no; } } if ((n_yes == 0) || (n_no == 0)) { /* no split. All satisfy or all don't satisfy */ continue; } for (s = 0, einc = 0; s < n_state; s++) { for (k = 0, yes_dnom = 0; k < n_density; k++) { yes_dnom += yes_dist[s][0][k]; } if (yes_dnom == 0) break; yes_norm = 1.0 / yes_dnom; for (j = 0; j < n_stream; j++) { for (k = 0; k < n_density; k++) { yes_dist[s][j][k] *= yes_norm; } } for (k = 0, no_dnom = 0; k < n_density; k++) { no_dnom += no_dist[s][0][k]; } if (no_dnom == 0) break; no_norm = 1.0 / no_dnom; for (j = 0; j < n_stream; j++) { for (k = 0; k < n_density; k++) { no_dist[s][j][k] *= no_norm; } } /* MODIFICATION FOR CONTINUOUS_TREES: Do appropriate operations for discrete and continuous */ if (continuous == 1) { y_ent = 0; n_ent = 0; for (j = 0; j < n_stream; j++) { if (yes_dnom != 0) { for (k = 0; k < veclen[j]; k++) { yes_means[s][j][k] *= yes_norm; yes_vars[s][j][k] = yes_vars[s][j][k]*yes_norm - yes_means[s][j][k]*yes_means[s][j][k]; if (yes_vars[s][j][k] < varfloor) yes_vars[s][j][k] = varfloor; } } if (no_dnom != 0) { for (k = 0; k < veclen[j]; k++) { no_means[s][j][k] *= no_norm; no_vars[s][j][k] = no_vars[s][j][k]*no_norm - no_means[s][j][k]*no_means[s][j][k]; if (no_vars[s][j][k] < varfloor) no_vars[s][j][k] = varfloor; } } y_ent += yes_dnom * ent_cont(yes_means[s][j],yes_vars[s][j],veclen[j]); n_ent += no_dnom * ent_cont(no_means[s][j],no_vars[s][j],veclen[j]); } einc += (float64)stwt[s] * (y_ent + n_ent); } else { einc += (float64)stwt[s] * wt_ent_inc(yes_dist[s], yes_dnom, no_dist[s], no_dnom, dist[s], n_stream, n_density); } } /* END MODIFICATION FOR CONTINUOUS_TREES */ /* ADDITION FOR CONTINUOUS_TREES; In current code this is true only for continous HMM */ if (continuous == 1) { einc -= node_wt_ent; } /* END ADDITION FOR CONTINUOUS_TREES */ if (s < n_state) { /* Ended iteration over states prematurely; assume 'bad' question */ continue; } if (einc > b_einc) { b_einc = einc; b_q = q; n_b_yes = n_yes; n_b_no = n_no; } } if ((n_b_yes == 0) || (n_b_no == 0)) { /* No best question */ *out_best_q = NULL; return 0; } yes_id = (uint32 *)ckd_calloc(n_b_yes, sizeof(uint32)); no_id = (uint32 *)ckd_calloc(n_b_no, sizeof(uint32)); memset(&yes_dist[0][0][0], 0, sizeof(float32) * n_state * n_stream * n_density); memset(&no_dist[0][0][0], 0, sizeof(float32) * n_state * n_stream * n_density); n_yes = n_no = 0; for (ii = 0; ii < n_id; ii++) { i = id[ii]; if (eval_quest(&all_q[b_q], dfeat[i], n_dfeat)) { for (s = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { for (k = 0; k < n_density; k++) { yes_dist[s][j][k] += mixw[i][s][j][k]; } } } yes_id[n_yes] = i; ++n_yes; } else { for (s = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { for (k = 0; k < n_density; k++) { no_dist[s][j][k] += mixw[i][s][j][k]; } } } no_id[n_no] = i; ++n_no; } } ckd_free_3d((void ***)yes_dist); ckd_free((void *)yes_id); ckd_free_3d((void ***)no_dist); ckd_free((void *)no_id); /* ADDITION FOR CONTINUOUS_TREES */ if (continuous == 1) { ckd_free_3d((void ***)yes_means); ckd_free_3d((void ***)yes_vars); ckd_free_3d((void ***)no_means); ckd_free_3d((void ***)no_vars); } /* END ADDITION FOR CONTINUOUS_TREES */ *out_best_q = &all_q[b_q]; return b_einc; }
int mk_node(dtree_node_t *node, uint32 node_id, uint32 *id, uint32 n_id, float32 ****mixw, float32 ****means, float32 ****vars, uint32 *veclen, uint32 n_model, uint32 n_state, uint32 n_stream, uint32 n_density, float32 *stwt, float32 mwfloor) { float32 ***mixw_occ, **dist; uint32 mm, m, s, j, k; float64 *dnom, norm, wt_ent, s_wt_ent, occ; float32 mx_wt; uint32 *l_id; float32 ***lmeans=0,***lvars=0; float32 varfloor=0; uint32 continuous, sumveclen; char* type; type = (char *)cmd_ln_str("-ts2cbfn"); if (strcmp(type,".semi.")!=0 && strcmp(type,".cont.") != 0) E_FATAL("Type %s unsupported; trees can only be built on types .semi. or .cont.\n",type); if (strcmp(type,".cont.") == 0) continuous = 1; else continuous = 0; if (continuous == 1) { varfloor = cmd_ln_float32("-varfloor"); /* Sumveclen is overallocation, but coding is simpler */ for (j=0,sumveclen=0; j < n_stream; j++) sumveclen += veclen[j]; lmeans = (float32 ***) ckd_calloc_3d(n_state,n_stream,sumveclen,sizeof(float32)); lvars = (float32 ***) ckd_calloc_3d(n_state,n_stream,sumveclen,sizeof(float32)); } mixw_occ = (float32 ***)ckd_calloc_3d(n_state, n_stream, n_density, sizeof(float32)); dist = (float32 **)ckd_calloc_2d(n_stream, n_density, sizeof(float32)); dnom = (float64 *)ckd_calloc(n_stream, sizeof(float64)); /* Merge distributions of all the elements in a cluster for combined distribution */ for (s = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { float32 *lmeanvec=0, *lvarvec=0; if (continuous == 1) { lmeanvec = lmeans[s][j]; lvarvec = lvars[s][j]; } for (mm = 0; mm < n_id; mm++) { m = id[mm]; for (k = 0; k < n_density; k++) { mixw_occ[s][j][k] += mixw[m][s][j][k]; } /* For continuous hmms we have only one gaussian per state */ if (continuous == 1) { for (k = 0; k < veclen[j]; k++) { lmeanvec[k] += mixw[m][s][j][0] * means[m][s][j][k]; lvarvec[k] += mixw[m][s][j][0] * (vars[m][s][j][k] + means[m][s][j][k] * means[m][s][j][k]); } } } if (continuous == 1) { if (mixw_occ[s][j][0] != 0) { for (k = 0; k < veclen[j]; k++) { lmeanvec[k] /= mixw_occ[s][j][0]; lvarvec[k] = lvarvec[k]/mixw_occ[s][j][0] - lmeanvec[k]*lmeanvec[k]; if (lvarvec[k] < varfloor) lvarvec[k] = varfloor; } } else { for (k = 0; k < veclen[j]; k++) if (lmeanvec[k] != 0) E_FATAL("denominator = 0, but numerator = %f at k = %d\n",lmeanvec[k],k); } } } } /* Find out which state is under consideration */ for (j = 0, mx_wt = 0, s = 0; s < n_state; s++) { if (stwt[s] > mx_wt) { mx_wt = stwt[s]; j = s; } } /* occ is the same for each independent feature, so just choose 0 */ for (k = 0, occ = 0; k < n_density; k++) { occ += mixw_occ[j][0][k]; } for (s = 0, wt_ent = 0; s < n_state; s++) { for (j = 0; j < n_stream; j++) { for (k = 0, dnom[j] = 0; k < n_density; k++) { dnom[j] += mixw_occ[s][j][k]; } } for (j = 0, s_wt_ent = 0; j < n_stream; j++) { norm = 1.0 / dnom[j]; /* discrete_entropy for discrete case, continuous entropy for continuous HMMs */ if (continuous != 1) { for (k = 0; k < n_density; k++) { dist[j][k] = mixw_occ[s][j][k] * norm; if (dist[j][k] < mwfloor) dist[j][k] = mwfloor; } s_wt_ent += dnom[j] * ent_d(dist[j], n_density); } else { s_wt_ent += dnom[j] * ent_cont(lmeans[s][j], lvars[s][j], veclen[j]); } } wt_ent += stwt[s] * s_wt_ent; } node->node_id = node_id; l_id = ckd_calloc(n_id, sizeof(uint32)); for (j = 0; j < n_id; j++) { l_id[j] = id[j]; } node->id = l_id; node->n_id = n_id; node->mixw_occ = mixw_occ; if (continuous == 1) { node->means = lmeans; node->vars = lvars; } node->occ = occ; node->wt_ent = wt_ent; ckd_free_2d((void **)dist); ckd_free((void *)dnom); return S3_SUCCESS; }