void psmc_decode(const psmc_par_t *pp, const psmc_data_t *pd) { hmm_par_t *hp = pd->hp; int i, k, prev, start; FLOAT p, q, *t, *t2, *t_min; double *cnt = 0; int32_t n_cnt; // compute the time intervals and the coalescent average t = (FLOAT*)malloc(sizeof(FLOAT) * (pp->n + 1)); for (k = 0; k <= pp->n; ++k) { t[k] = (pd->t[k] + 1.0 - (pd->t[k+1] - pd->t[k]) / (exp(pd->t[k+1]) / exp(pd->t[k]) - 1.0)) / pd->C_pi; if (pp->flag & PSMC_F_FULLDEC) fprintf(pp->fpout, "TC\t%d\t%lf\t%lf\t%lf\n", k, t[k], pd->t[k], pd->t[k+1]); } t2 = (FLOAT*)malloc(sizeof(FLOAT) * pp->n_free); t_min = (FLOAT*)malloc(sizeof(FLOAT) * pp->n_free); t_min[0] = 0; for (k = i = 0, p = 0; k < pp->n_free; ++k) { for (; i < pp->n; ++i) if (pp->par_map[i] == k) break; t_min[k] = pd->t[i]; prev = i; for (; i < pp->n; ++i) if (pp->par_map[i] > k) break; t2[k] = (pd->t[prev] + 1.0 - (pd->t[i] - pd->t[prev]) / (exp(pd->t[i]) / exp(pd->t[prev]) - 1.0)) / pd->C_pi; } if (pp->fpcnt) { fread(&n_cnt, 4, 1, pp->fpcnt); // read the number of counts per base cnt = (double*)calloc((pp->n + 1) * n_cnt, sizeof(double)); } // the core part hmm_pre_backward(hp); for (i = 0; i != pp->n_seqs; ++i) { hmm_data_t *hd; psmc_seq_t *s = pp->seqs + i; char *seq = (char*)calloc(s->L+1, 1); memcpy(seq, s->seq, s->L); hd = hmm_new_data(s->L, seq, hp); hmm_forward(hp, hd); hmm_backward(hp, hd); if (!(pp->flag & PSMC_F_FULLDEC) && (pp->flag & PSMC_F_DECODE)) { // posterior decoding int *x, kl; hmm_post_decode(hp, hd); /* show path */ x = hd->p; start = 1; prev = x[1]; p = hd->f[1][prev] * hd->b[1][prev] * hd->s[1]; for (k = 2; k <= s->L; ++k) { if (prev != x[k]) { kl = pp->par_map[prev]; fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.5f\t%.5f\t%.5f\n", s->name, start, k-1, kl, t_min[kl], t2[kl], kl == pp->n_free-1? pp->max_t * 2. : t_min[kl+1]); // fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.3lf\t%.2lf\n", s->name, start, k-1, prev, t[prev], p); prev = x[k]; start = k; p = 0.0; } q = hd->f[k][x[k]] * hd->b[k][x[k]] * hd->s[k]; if (p < q) p = q; } // fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.3lf\t%.2lf\n", s->name, start, k-1, prev, t[prev], p); kl = pp->par_map[prev]; fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.5f\t%.5f\t%.5f\n", s->name, start, k-1, kl, t_min[kl], t2[kl], kl == pp->n_free-1? pp->max_t * 2. : t_min[kl+1]); fflush(pp->fpout); } else if (pp->flag & PSMC_F_DECODE) { // full decoding FLOAT *prob = (FLOAT*)malloc(sizeof(FLOAT) * hp->n); for (k = 1; k <= s->L; ++k) { int l; FLOAT p, *fu, *bu1, *eu1; // p is the recombination probability? if (k < s->L) { p = 0.0; fu = hd->f[k]; bu1 = hd->b[k+1]; eu1 = hp->e[(int)hd->seq[k+1]]; for (l = 0; l < hp->n; ++l) p += fu[l] * hp->a[l][l] * bu1[l] * eu1[l]; p = 1.0 - p; } else p = 0.0; hmm_post_state(hp, hd, k, prob); fprintf(pp->fpout, "DF\t%d\t%lf", k, p); for (l = 0; l < hp->n; ++l) fprintf(pp->fpout, "\t%.4f", prob[l]); fprintf(pp->fpout, "\n"); } free(prob); } if (pp->fpcnt) { // very similar to full decoding above int32_t *cnt1, l; FLOAT *prob = (FLOAT*)malloc(sizeof(FLOAT) * hp->n); fread(&l, 4, 1, pp->fpcnt); assert(l >= s->L); // FIXME: if there are very short sequence in the input, fpcnt may be different from the input!!! cnt1 = malloc(l * n_cnt * 4); fread(cnt1, n_cnt * l, 4, pp->fpcnt); for (k = 1; k <= s->L; ++k) { int j, l; hmm_post_state(hp, hd, k, prob); for (l = 0; l < hp->n; ++l) for (j = 0; j < n_cnt; ++j) cnt[l*n_cnt + j] += prob[l] * cnt1[(k-1)*n_cnt + j]; } free(cnt1); free(prob); } /* free */ hmm_delete_data(hd); free(seq); } if (pp->fpcnt) { for (i = 0; i < hp->n; ++i) { fprintf(pp->fpout, "CT\t%d", i); for (k = 0; k < n_cnt; ++k) fprintf(pp->fpout, "\t%f", cnt[i*n_cnt + k]); fprintf(pp->fpout, "\n"); } } free(t); free(t2); free(t_min); free(cnt); }
// method: // hmm_train // // description: // double hmm_train( HiddenMarkovModel_t * hmm, Vector_t * observations[], int count, int iterations, double tolerance ) { assert( hmm != 0 ); double new_likelihood = 0.0; if( ( iterations != 0 ) || ( tolerance != 0.0 ) ) { //int N = sizeof( observations ) / sizeof( Vector_t * ); int N = count; int current_iteration = 1; int stop = 0; // initialize epsilon (aka, ksi or psi) and gamma Matrix_t * epsilon[ N ]; Matrix_t * gamma[ N ]; for( int i = 0; i < N; i++ ) { int T = mat_xsize( observations[ i ] ); epsilon[ i ] = mat_allocate3d( T, hmm->_states, hmm->_states ); gamma[ i ] = mat_allocate2d( T, hmm->_states ); } // initial log likelihood double old_likelihood = 0.0; // train until done (max iterations or converged within tolerance) do { // train for each sequence in observations for( int i = 0; i < N; i++ ) { Vector_t * sequence = observations[ i ]; int T = mat_xsize( sequence ); Vector_t * scaling = 0; // (a) calculate forward and backward probability Matrix_t * fwd = hmm_forward( hmm, sequence, &scaling ); Matrix_t * bwd = hmm_backward( hmm, sequence, scaling ); // (b) calculate the frequency of the transition-emission pair valus // and divide by the probability of the entire sequence // // calculate gamma for( int t = 0; t < T; t++ ) { double s = 0.0; for( int k = 0; k < hmm->_states; k++ ) { double gv = mat_get2d( fwd, t, k ) * mat_get2d( bwd, t, k ); mat_set2d( gamma[ i ], gv, t, k ); s += gv; } if( s != 0.0 ) { for( int k = 0; k < hmm->_states; k++ ) { double gv = mat_get2d( gamma[ i ], t, k ); mat_set2d( gamma[ i ], (gv / s), t, k ); } } } // calculate epsilon for( int t = 0; t < T - 1; t++ ) { double s = 0.0; for( int k = 0; k < hmm->_states; k++ ) { for( int l = 0; l < hmm->_states; l++ ) { int next_symbol = (int)(mat_get1d( sequence, t + 1 )); double gv = mat_get2d( fwd, t, k ) * mat_get2d( bwd, t + 1, l ); double ev = gv * mat_get2d( hmm->_A, k, l ) * mat_get2d( hmm->_B, l, next_symbol ); mat_set3d( epsilon[ i ], ev, t, k, l ); s += ev; } } if( s != 0.0 ) { for( int k = 0; k < hmm->_states; k++ ) { for( int l = 0; l < hmm->_states; l++ ) { double ev = mat_get3d( epsilon[ i ], t, k, l ); mat_set3d( epsilon[ i ], (ev / s ), t, k, l ); } } } } // calculate log likelihood for( int t = 0; t < mat_xsize( scaling ); t++ ) { new_likelihood += log( mat_get1d( scaling, t ) ); } // free working fwd, bwd and scaling matrix mat_deallocate( fwd ); mat_deallocate( bwd ); mat_deallocate( scaling ); scaling = 0; } // average likelihood new_likelihood /= (double)N; // check for convergence if( hmm_has_converged( old_likelihood, new_likelihood, current_iteration, iterations, tolerance ) != 0 ) { stop = 1; } else { // (c) calculate parameter re-estimation ++current_iteration; old_likelihood = new_likelihood; new_likelihood = 0.0; // re-estimate initial state for( int k = 0; k < hmm->_states; k++ ) { double s = 0.0; for( int i = 0; i < N; i++ ) { s += mat_get2d( gamma[ i ], 0, k ); } mat_set1d( hmm->_pi, (s / N), k ); } // re-estimate transition probabilities for( int i = 0; i < hmm->_states; i++ ) { for( int j = 0; j < hmm->_states; j++ ) { double den = 0.0; double num = 0.0; for( int k = 0; k < N; k++ ) { int T = mat_xsize( observations[ k ] ); for( int l = 0; l < T - 1; l++ ) { double ev = mat_get3d( epsilon[ k ], l, i, j ); double gv = mat_get2d( gamma[ k ], l, i ); num += ev; den += gv; } } double av = (den != 0.0) ? num / den : 0.0; mat_set2d( hmm->_A, av, i, j ); } } // re-estimation emission probabilities for( int i = 0; i < hmm->_states; i++ ) { for( int j = 0; j < hmm->_symbols; j++ ) { double den = 0.0; double num = 0.0; for( int k = 0; k < N; k++ ) { int T = mat_xsize( observations[ k ] ); for( int l = 0; l < T; l++ ) { double gv = mat_get2d( gamma[ k ], l, i ); int ov = (int)(mat_get1d( observations[ k ], l )); if( ov == j ) num += gv; den += gv; } } double bv = (num == 0.0) ? 1e-10 : num / den; mat_set2d( hmm->_B, bv, i, j ); } } } } while( stop == 0 ); // free epsilon and gamma for( int i = 0; i < N; i++ ) { mat_deallocate( epsilon[ i ] ); mat_deallocate( gamma[ i ] ); } } return new_likelihood; }