// method: // hmm_evaluate // // description: // double hmm_evaluate( HiddenMarkovModel_t * hmm, Vector_t * observation ) { assert( hmm != 0 ); double likelihood = 0.0; Vector_t * coefficients = 0; // calculate forward probabilities Matrix_t * fwd = hmm_forward( hmm, observation, &coefficients ); for( int i = 0; i < mat_xsize( coefficients ); i++ ) { likelihood += log( mat_get1d( coefficients, i ) ); } // clean up memory mat_deallocate( fwd ); mat_deallocate( coefficients ); // return result return exp( likelihood ); }
void psmc_decode(const psmc_par_t *pp, const psmc_data_t *pd) { hmm_par_t *hp = pd->hp; int i, k, prev, start; FLOAT p, q, *t, *t2, *t_min; double *cnt = 0; int32_t n_cnt; // compute the time intervals and the coalescent average t = (FLOAT*)malloc(sizeof(FLOAT) * (pp->n + 1)); for (k = 0; k <= pp->n; ++k) { t[k] = (pd->t[k] + 1.0 - (pd->t[k+1] - pd->t[k]) / (exp(pd->t[k+1]) / exp(pd->t[k]) - 1.0)) / pd->C_pi; if (pp->flag & PSMC_F_FULLDEC) fprintf(pp->fpout, "TC\t%d\t%lf\t%lf\t%lf\n", k, t[k], pd->t[k], pd->t[k+1]); } t2 = (FLOAT*)malloc(sizeof(FLOAT) * pp->n_free); t_min = (FLOAT*)malloc(sizeof(FLOAT) * pp->n_free); t_min[0] = 0; for (k = i = 0, p = 0; k < pp->n_free; ++k) { for (; i < pp->n; ++i) if (pp->par_map[i] == k) break; t_min[k] = pd->t[i]; prev = i; for (; i < pp->n; ++i) if (pp->par_map[i] > k) break; t2[k] = (pd->t[prev] + 1.0 - (pd->t[i] - pd->t[prev]) / (exp(pd->t[i]) / exp(pd->t[prev]) - 1.0)) / pd->C_pi; } if (pp->fpcnt) { fread(&n_cnt, 4, 1, pp->fpcnt); // read the number of counts per base cnt = (double*)calloc((pp->n + 1) * n_cnt, sizeof(double)); } // the core part hmm_pre_backward(hp); for (i = 0; i != pp->n_seqs; ++i) { hmm_data_t *hd; psmc_seq_t *s = pp->seqs + i; char *seq = (char*)calloc(s->L+1, 1); memcpy(seq, s->seq, s->L); hd = hmm_new_data(s->L, seq, hp); hmm_forward(hp, hd); hmm_backward(hp, hd); if (!(pp->flag & PSMC_F_FULLDEC) && (pp->flag & PSMC_F_DECODE)) { // posterior decoding int *x, kl; hmm_post_decode(hp, hd); /* show path */ x = hd->p; start = 1; prev = x[1]; p = hd->f[1][prev] * hd->b[1][prev] * hd->s[1]; for (k = 2; k <= s->L; ++k) { if (prev != x[k]) { kl = pp->par_map[prev]; fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.5f\t%.5f\t%.5f\n", s->name, start, k-1, kl, t_min[kl], t2[kl], kl == pp->n_free-1? pp->max_t * 2. 
: t_min[kl+1]); // fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.3lf\t%.2lf\n", s->name, start, k-1, prev, t[prev], p); prev = x[k]; start = k; p = 0.0; } q = hd->f[k][x[k]] * hd->b[k][x[k]] * hd->s[k]; if (p < q) p = q; } // fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.3lf\t%.2lf\n", s->name, start, k-1, prev, t[prev], p); kl = pp->par_map[prev]; fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.5f\t%.5f\t%.5f\n", s->name, start, k-1, kl, t_min[kl], t2[kl], kl == pp->n_free-1? pp->max_t * 2. : t_min[kl+1]); fflush(pp->fpout); } else if (pp->flag & PSMC_F_DECODE) { // full decoding FLOAT *prob = (FLOAT*)malloc(sizeof(FLOAT) * hp->n); for (k = 1; k <= s->L; ++k) { int l; FLOAT p, *fu, *bu1, *eu1; // p is the recombination probability? if (k < s->L) { p = 0.0; fu = hd->f[k]; bu1 = hd->b[k+1]; eu1 = hp->e[(int)hd->seq[k+1]]; for (l = 0; l < hp->n; ++l) p += fu[l] * hp->a[l][l] * bu1[l] * eu1[l]; p = 1.0 - p; } else p = 0.0; hmm_post_state(hp, hd, k, prob); fprintf(pp->fpout, "DF\t%d\t%lf", k, p); for (l = 0; l < hp->n; ++l) fprintf(pp->fpout, "\t%.4f", prob[l]); fprintf(pp->fpout, "\n"); } free(prob); } if (pp->fpcnt) { // very similar to full decoding above int32_t *cnt1, l; FLOAT *prob = (FLOAT*)malloc(sizeof(FLOAT) * hp->n); fread(&l, 4, 1, pp->fpcnt); assert(l >= s->L); // FIXME: if there are very short sequence in the input, fpcnt may be different from the input!!! cnt1 = malloc(l * n_cnt * 4); fread(cnt1, n_cnt * l, 4, pp->fpcnt); for (k = 1; k <= s->L; ++k) { int j, l; hmm_post_state(hp, hd, k, prob); for (l = 0; l < hp->n; ++l) for (j = 0; j < n_cnt; ++j) cnt[l*n_cnt + j] += prob[l] * cnt1[(k-1)*n_cnt + j]; } free(cnt1); free(prob); } /* free */ hmm_delete_data(hd); free(seq); } if (pp->fpcnt) { for (i = 0; i < hp->n; ++i) { fprintf(pp->fpout, "CT\t%d", i); for (k = 0; k < n_cnt; ++k) fprintf(pp->fpout, "\t%f", cnt[i*n_cnt + k]); fprintf(pp->fpout, "\n"); } } free(t); free(t2); free(t_min); free(cnt); }
int main(int argc, char *argv[]) { char c; List *l; int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1, winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves, refidx = 1, base_by_base = FALSE, windowWig = FALSE; TreeModel **backgd_mods = NULL, **feat_mods = NULL; HMM *backgd_hmm = NULL, *feat_hmm = NULL; msa_format_type inform = UNKNOWN_FORMAT; GFF_Set *features = NULL; MSA *msa, *msa_compl=NULL; double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions, *winscore_pos=NULL, *winscore_neg=NULL; int *no_alignment=NULL; List *pruned_names; char *msa_fname; FILE *infile; int opt_idx; struct option long_opts[] = { {"background-mods", 1, 0, 'b'}, {"background-hmm", 1, 0, 'B'}, {"feature-mods", 1, 0, 'f'}, {"feature-hmm", 1, 0, 'F'}, {"features", 1, 0, 'g'}, {"window", 1, 0, 'w'}, {"window-wig", 1, 0, 'W'}, {"base-by-base", 0, 0, 'y'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"output-bed", 0, 0, 'd'}, {"verbose", 0, 0, 'v'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) { switch (c) { case 'B': backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'b': l = get_arg_list(optarg); backgd_nmods = lst_size(l); backgd_mods = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'F': feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'f': l = get_arg_list(optarg); feat_nmods = lst_size(l); feat_mods = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'g': features = gff_read_set(phast_fopen(optarg, "r")); break; case 'w': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be 
positive.\n"); break; case 'W': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be positive.\n"); windowWig = TRUE; break; case 'y': base_by_base = TRUE; break; case 'i': inform = msa_str_to_format(optarg); if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n"); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'd': bed_output = 1; break; case 'h': printf("%s", HELP); exit(0); case 'v': verbose = 1; break; case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } set_seed(-1); if (backgd_mods == NULL || feat_mods == NULL) die("ERROR: -b and -f required. Try '%s -h'.\n", argv[0]); if (backgd_nmods == 1 && backgd_hmm == NULL) backgd_hmm = hmm_create_trivial(); else if (backgd_hmm == NULL) die("ERROR: -B required. Try '%s -h'.\n", argv[0]); if (feat_nmods == 1 && feat_hmm == NULL) feat_hmm = hmm_create_trivial(); else if (feat_hmm == NULL) die("ERROR: -F required. Try '%s -h'.\n", argv[0]); if ((winsize == -1 && features == NULL && !base_by_base) || (winsize != -1 && features != NULL) || (winsize != -1 && base_by_base) || (features != NULL && base_by_base)) die("ERROR: must specify exactly one of -g, -w, and -y. Try '%s -h'.\n", argv[0]); if (backgd_hmm->nstates != backgd_nmods) die("ERROR: number of states must equal number of tree models for background.\n"); if (feat_hmm->nstates != feat_nmods) die("ERROR: number of states must equal number of tree models for features.\n"); if (features != NULL && lst_size(features->features) == 0) die("ERROR: empty features file.\n"); if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1)) die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n"); if (optind != argc - 1) die("ERROR: too few arguments. 
Try '%s -h'.\n", argv[0]); if (verbose) fprintf(stderr, "Reading alignment ...\n"); msa_fname = argv[optind]; infile = phast_fopen(msa_fname, "r"); if (inform == UNKNOWN_FORMAT) inform = msa_format_for_content(infile, 1); if (inform == MAF) msa = maf_read(infile, NULL, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); else msa = msa_new_from_file_define_format(infile, inform, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* need ordered representation of alignment */ if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) ) die("ERROR: ordered sufficient statistics are required.\n"); pruned_names = lst_new_ptr(msa->nseqs); for (i = 0; i < backgd_nmods; i++) { old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2; tm_prune(backgd_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? ", " : ").\n"); } lst_free_strings(pruned_names); } for (i = 0; i < feat_nmods; i++) { old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2; tm_prune(feat_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? 
", " : ").\n"); } lst_free_strings(pruned_names); } lst_free(pruned_names); /* first have to subtract offset from features, if necessary */ if (msa->idx_offset != 0 && features != NULL) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start -= msa->idx_offset; f->end -= msa->idx_offset; } } /* convert to coord frame of alignment */ if (features != NULL && refidx != 0) { if (verbose) fprintf(stderr, "Mapping coordinates ...\n"); msa_map_gff_coords(msa, features, refidx, 0, 0); if (lst_size(features->features) == 0) die("ERROR: no features within coordinate range of alignment.\n"); } /* Make a reverse complemented copy of the alignment. The two strands will be processed separately, to avoid problems with overlapping features, etc. */ if (!base_by_base) { /* skip in base by base case */ if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n"); msa_compl = msa_create_copy(msa, 0); /* temporary workaround: make sure reverse complement not based on sufficient stats */ if (msa_compl->seqs == NULL) ss_to_msa(msa_compl); if (msa_compl->ss != NULL) { ss_free(msa_compl->ss); msa_compl->ss = NULL; } msa_reverse_compl(msa_compl); } /* allocate memory for computing scores */ backgd_emissions = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_emissions[i] = smalloc(msa->length * sizeof(double)); feat_emissions = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_emissions[i] = smalloc(msa->length * sizeof(double)); max_nmods = max(backgd_nmods, feat_nmods); dummy_emissions = smalloc(max_nmods * sizeof(void*)); mem = smalloc(max_nmods * sizeof(void*)); /* memory for forward algorithm -- each block must be as large as the largest feature */ if (features != NULL) { for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->end - f->start + 1 > memblocksize) memblocksize = 
f->end - f->start + 1; } } else memblocksize = winsize; /* -1 if base-by-base mode */ if (memblocksize > 0) for (i = 0; i < max_nmods; i++) mem[i] = smalloc(memblocksize * sizeof(double)); if (winsize != -1) { winscore_pos = smalloc(msa->length * sizeof(double)); winscore_neg = smalloc(msa->length * sizeof(double)); no_alignment = smalloc(msa->length * sizeof(int)); for (i = 0; i < msa->length; i++) { winscore_pos[i] = winscore_neg[i] = NEGINFTY; if (refidx == 0) no_alignment[i] = FALSE; else no_alignment[i] = msa_missing_col(msa, refidx, i); } } /* the rest will be repeated for each strand */ for (strand = 1; strand <= 2; strand++) { MSA *thismsa = strand == 1 ? msa : msa_compl; double *winscore = strand == 1 ? winscore_pos : winscore_neg; if (base_by_base && strand == 2) break; /* don't do second pass in base_by_base case */ if (verbose) fprintf(stderr, "Processing %c strand ...\n", strand == 1 ? '+' : '-'); /* set up dummy categories array, so that emissions are only computed where needed */ thismsa->categories = smalloc(thismsa->length * sizeof(int)); thismsa->ncats = 1; if (winsize != -1) { if (strand == 1) for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[i] ? 0 : 1; else for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 
0 : 1; } else if (features != NULL) { for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0; for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->start <= 0 || f->end <= 0) { fprintf(stderr, "WARNING: feature out of range ('"); gff_print_feat(stderr, f); fprintf(stderr, "')\n"); continue; } if (strand == 1 && f->strand != '-') for (j = f->start - 1; j < f->end; j++) thismsa->categories[j] = 1; else if (strand == 2 && f->strand == '-') for (j = thismsa->length - f->end; j < thismsa->length - f->start + 1; j++) thismsa->categories[j] = 1; } } else { /* base-by-base scores */ for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1; } if (thismsa->ss != NULL) ss_update_categories(thismsa); /* compute emissions */ for (i = 0; i < backgd_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1); tl_compute_log_likelihood(backgd_mods[i], thismsa, backgd_emissions[i], NULL, 1, NULL); } for (i = 0; i < feat_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1); tl_compute_log_likelihood(feat_mods[i], thismsa, feat_emissions[i], NULL, 1, NULL); } /* now compute scores */ if (winsize != -1) { /* windows case */ int winstart; if (verbose) fprintf(stderr, "Computing scores ...\n"); for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) { int centeridx = winstart + winsize/2; if (strand == 2) centeridx = thismsa->length - centeridx - 1; if (no_alignment[centeridx]) continue; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][winstart]); winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions, winsize, mem); if (winscore[centeridx] <= NEGINFTY) { winscore[centeridx] = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][winstart]); winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions, winsize, mem); if 
(winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY; } } else if (features != NULL) { /* features case */ if (verbose) fprintf(stderr, "Computing scores ...\n"); for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); int s, e; if ((strand == 1 && f->strand == '-') || (strand == 2 && f->strand != '-') || f->start <= 0 || f->end <= 0 || f->end - f->start < 0) continue; /* effective coords */ if (f->strand == '-') { s = thismsa->length - f->end + 1; e = thismsa->length - f->start + 1; } else { s = f->start; e = f->end; } f->score_is_null = 0; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][s-1]); f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem); if (f->score <= NEGINFTY) { f->score = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][s-1]); f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem); if (f->score < NEGINFTY) f->score = NEGINFTY; } } } if (verbose) fprintf(stderr, "Generating output ...\n"); if (winsize != -1 && windowWig == FALSE) { /* standard windows output */ for (i = 0, j = 0; i < msa->length; i++) { if (no_alignment[i] == FALSE) printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i], winscore_neg[i]); if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++; } } else if (windowWig == TRUE) { /* windows with wig output */ int last = NEGINFTY; for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) { if (j > last + 1) printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? 
msa->names[refidx-1] : "alignment", j + msa->idx_offset + 1); printf("%.3f\n", winscore_pos[i]); last = j; } j++; } } } else if (features != NULL) { /* features output */ /* return to coord frame of reference seq (also, replace offset) */ if (refidx != 0) msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset); else if (msa->idx_offset != 0) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start += msa->idx_offset; f->end += msa->idx_offset; } } if (bed_output) gff_print_bed(stdout, features, FALSE); else gff_print_set(stdout, features); } else { /* base-by-base scores */ /* in this case, we can just output the difference between the emissions */ printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? msa->names[refidx-1] : "alignment", msa->idx_offset + 1); for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]); j++; } } } if (verbose) fprintf(stderr, "\nDone.\n"); return 0; }
// method: // hmm_train // // description: // double hmm_train( HiddenMarkovModel_t * hmm, Vector_t * observations[], int count, int iterations, double tolerance ) { assert( hmm != 0 ); double new_likelihood = 0.0; if( ( iterations != 0 ) || ( tolerance != 0.0 ) ) { //int N = sizeof( observations ) / sizeof( Vector_t * ); int N = count; int current_iteration = 1; int stop = 0; // initialize epsilon (aka, ksi or psi) and gamma Matrix_t * epsilon[ N ]; Matrix_t * gamma[ N ]; for( int i = 0; i < N; i++ ) { int T = mat_xsize( observations[ i ] ); epsilon[ i ] = mat_allocate3d( T, hmm->_states, hmm->_states ); gamma[ i ] = mat_allocate2d( T, hmm->_states ); } // initial log likelihood double old_likelihood = 0.0; // train until done (max iterations or converged within tolerance) do { // train for each sequence in observations for( int i = 0; i < N; i++ ) { Vector_t * sequence = observations[ i ]; int T = mat_xsize( sequence ); Vector_t * scaling = 0; // (a) calculate forward and backward probability Matrix_t * fwd = hmm_forward( hmm, sequence, &scaling ); Matrix_t * bwd = hmm_backward( hmm, sequence, scaling ); // (b) calculate the frequency of the transition-emission pair valus // and divide by the probability of the entire sequence // // calculate gamma for( int t = 0; t < T; t++ ) { double s = 0.0; for( int k = 0; k < hmm->_states; k++ ) { double gv = mat_get2d( fwd, t, k ) * mat_get2d( bwd, t, k ); mat_set2d( gamma[ i ], gv, t, k ); s += gv; } if( s != 0.0 ) { for( int k = 0; k < hmm->_states; k++ ) { double gv = mat_get2d( gamma[ i ], t, k ); mat_set2d( gamma[ i ], (gv / s), t, k ); } } } // calculate epsilon for( int t = 0; t < T - 1; t++ ) { double s = 0.0; for( int k = 0; k < hmm->_states; k++ ) { for( int l = 0; l < hmm->_states; l++ ) { int next_symbol = (int)(mat_get1d( sequence, t + 1 )); double gv = mat_get2d( fwd, t, k ) * mat_get2d( bwd, t + 1, l ); double ev = gv * mat_get2d( hmm->_A, k, l ) * mat_get2d( hmm->_B, l, next_symbol ); mat_set3d( epsilon[ i 
], ev, t, k, l ); s += ev; } } if( s != 0.0 ) { for( int k = 0; k < hmm->_states; k++ ) { for( int l = 0; l < hmm->_states; l++ ) { double ev = mat_get3d( epsilon[ i ], t, k, l ); mat_set3d( epsilon[ i ], (ev / s ), t, k, l ); } } } } // calculate log likelihood for( int t = 0; t < mat_xsize( scaling ); t++ ) { new_likelihood += log( mat_get1d( scaling, t ) ); } // free working fwd, bwd and scaling matrix mat_deallocate( fwd ); mat_deallocate( bwd ); mat_deallocate( scaling ); scaling = 0; } // average likelihood new_likelihood /= (double)N; // check for convergence if( hmm_has_converged( old_likelihood, new_likelihood, current_iteration, iterations, tolerance ) != 0 ) { stop = 1; } else { // (c) calculate parameter re-estimation ++current_iteration; old_likelihood = new_likelihood; new_likelihood = 0.0; // re-estimate initial state for( int k = 0; k < hmm->_states; k++ ) { double s = 0.0; for( int i = 0; i < N; i++ ) { s += mat_get2d( gamma[ i ], 0, k ); } mat_set1d( hmm->_pi, (s / N), k ); } // re-estimate transition probabilities for( int i = 0; i < hmm->_states; i++ ) { for( int j = 0; j < hmm->_states; j++ ) { double den = 0.0; double num = 0.0; for( int k = 0; k < N; k++ ) { int T = mat_xsize( observations[ k ] ); for( int l = 0; l < T - 1; l++ ) { double ev = mat_get3d( epsilon[ k ], l, i, j ); double gv = mat_get2d( gamma[ k ], l, i ); num += ev; den += gv; } } double av = (den != 0.0) ? num / den : 0.0; mat_set2d( hmm->_A, av, i, j ); } } // re-estimation emission probabilities for( int i = 0; i < hmm->_states; i++ ) { for( int j = 0; j < hmm->_symbols; j++ ) { double den = 0.0; double num = 0.0; for( int k = 0; k < N; k++ ) { int T = mat_xsize( observations[ k ] ); for( int l = 0; l < T; l++ ) { double gv = mat_get2d( gamma[ k ], l, i ); int ov = (int)(mat_get1d( observations[ k ], l )); if( ov == j ) num += gv; den += gv; } } double bv = (num == 0.0) ? 
1e-10 : num / den; mat_set2d( hmm->_B, bv, i, j ); } } } } while( stop == 0 ); // free epsilon and gamma for( int i = 0; i < N; i++ ) { mat_deallocate( epsilon[ i ] ); mat_deallocate( gamma[ i ] ); } } return new_likelihood; }
/* Adapter with a (parameter vector, opaque data) signature: hands `params`
   and `data` to unpack_params(), then runs hmm_forward() over the phylo-HMM
   held by the BDPhyloHmm that `data` points to, and returns the negated
   result.
   NOTE(review): presumably used as the objective of a minimizer, hence the
   sign flip -- confirm against the caller. */
double lnl_wrapper(Vector *params, void *data) {
  BDPhyloHmm *model = data;
  double fwd_score;

  /* must run first: the forward pass below reads state it updates */
  unpack_params(params, data);

  fwd_score = hmm_forward(model->phmm->hmm,
                          model->phmm->emissions,
                          model->phmm->alloc_len,
                          model->phmm->forward);
  return -fwd_score;
}