Example #1
0
File: aux.c Project: AB-O/psmc-1
void psmc_decode(const psmc_par_t *pp, const psmc_data_t *pd)
{
	hmm_par_t *hp = pd->hp;
	int i, k, prev, start;
	FLOAT p, q, *t, *t2, *t_min;
	double *cnt = 0;
	int32_t n_cnt;
	// compute the time intervals and the coalescent average
	t = (FLOAT*)malloc(sizeof(FLOAT) * (pp->n + 1));
	for (k = 0; k <= pp->n; ++k) {
		t[k] = (pd->t[k] + 1.0 - (pd->t[k+1] - pd->t[k]) / (exp(pd->t[k+1]) / exp(pd->t[k]) - 1.0)) / pd->C_pi;
		if (pp->flag & PSMC_F_FULLDEC) fprintf(pp->fpout, "TC\t%d\t%lf\t%lf\t%lf\n", k, t[k], pd->t[k], pd->t[k+1]);
	}
	t2 = (FLOAT*)malloc(sizeof(FLOAT) * pp->n_free);
	t_min = (FLOAT*)malloc(sizeof(FLOAT) * pp->n_free);
	t_min[0] = 0;
	for (k = i = 0, p = 0; k < pp->n_free; ++k) {
		for (; i < pp->n; ++i) if (pp->par_map[i] == k) break;
		t_min[k] = pd->t[i];
		prev = i;
		for (; i < pp->n; ++i) if (pp->par_map[i] > k) break;
		t2[k] = (pd->t[prev] + 1.0 - (pd->t[i] - pd->t[prev]) / (exp(pd->t[i]) / exp(pd->t[prev]) - 1.0)) / pd->C_pi;
	}
	if (pp->fpcnt) {
		fread(&n_cnt, 4, 1, pp->fpcnt); // read the number of counts per base
		cnt = (double*)calloc((pp->n + 1) * n_cnt, sizeof(double));
	}
	// the core part
	hmm_pre_backward(hp);
	for (i = 0; i != pp->n_seqs; ++i) {
		hmm_data_t *hd;
		psmc_seq_t *s = pp->seqs + i;
		char *seq = (char*)calloc(s->L+1, 1);
		memcpy(seq, s->seq, s->L);
		hd = hmm_new_data(s->L, seq, hp);
		hmm_forward(hp, hd);
		hmm_backward(hp, hd);
		if (!(pp->flag & PSMC_F_FULLDEC) && (pp->flag & PSMC_F_DECODE)) { // posterior decoding
			int *x, kl;
			hmm_post_decode(hp, hd);
			/* show path */
			x = hd->p;
			start = 1; prev = x[1];
			p = hd->f[1][prev] * hd->b[1][prev] * hd->s[1];
			for (k = 2; k <= s->L; ++k) {
				if (prev != x[k]) {
					kl = pp->par_map[prev];
					fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.5f\t%.5f\t%.5f\n", s->name, start, k-1, kl,
							t_min[kl], t2[kl], kl == pp->n_free-1? pp->max_t * 2. : t_min[kl+1]);
//					fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.3lf\t%.2lf\n", s->name, start, k-1, prev, t[prev], p);
					prev = x[k]; start = k; p = 0.0;
				}
				q = hd->f[k][x[k]] * hd->b[k][x[k]] * hd->s[k];
				if (p < q) p = q;
			}
//			fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.3lf\t%.2lf\n", s->name, start, k-1, prev, t[prev], p);
			kl = pp->par_map[prev];
			fprintf(pp->fpout, "DC\t%s\t%d\t%d\t%d\t%.5f\t%.5f\t%.5f\n", s->name, start, k-1, kl,
					t_min[kl], t2[kl], kl == pp->n_free-1? pp->max_t * 2. : t_min[kl+1]);
			fflush(pp->fpout);
		} else if (pp->flag & PSMC_F_DECODE) { // full decoding
			FLOAT *prob = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
			for (k = 1; k <= s->L; ++k) {
				int l;
				FLOAT p, *fu, *bu1, *eu1; // p is the recombination probability?
				if (k < s->L) {
					p = 0.0; fu = hd->f[k]; bu1 = hd->b[k+1]; eu1 = hp->e[(int)hd->seq[k+1]];
					for (l = 0; l < hp->n; ++l)
						p += fu[l] * hp->a[l][l] * bu1[l] * eu1[l];
					p = 1.0 - p;
				} else p = 0.0;
				hmm_post_state(hp, hd, k, prob);
				fprintf(pp->fpout, "DF\t%d\t%lf", k, p);
				for (l = 0; l < hp->n; ++l)
					fprintf(pp->fpout, "\t%.4f", prob[l]);
				fprintf(pp->fpout, "\n");
			}
			free(prob);
		}
		if (pp->fpcnt) { // very similar to full decoding above
			int32_t *cnt1, l;
			FLOAT *prob = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
			fread(&l, 4, 1, pp->fpcnt);
			assert(l >= s->L); // FIXME: if there are very short sequence in the input, fpcnt may be different from the input!!!
			cnt1 = malloc(l * n_cnt * 4);
			fread(cnt1, n_cnt * l, 4, pp->fpcnt);
			for (k = 1; k <= s->L; ++k) {
				int j, l;
				hmm_post_state(hp, hd, k, prob);
				for (l = 0; l < hp->n; ++l)
					for (j = 0; j < n_cnt; ++j) 
						cnt[l*n_cnt + j] += prob[l] * cnt1[(k-1)*n_cnt + j];
			}
			free(cnt1); free(prob);
		}
		/* free */
		hmm_delete_data(hd);
		free(seq);
	}
	if (pp->fpcnt) {
		for (i = 0; i < hp->n; ++i) {
			fprintf(pp->fpout, "CT\t%d", i);
			for (k = 0; k < n_cnt; ++k)
				fprintf(pp->fpout, "\t%f", cnt[i*n_cnt + k]);
			fprintf(pp->fpout, "\n");
		}
	}
	free(t); free(t2); free(t_min); free(cnt);
}
Example #2
0
// method:
//	hmm_train
//
// description:
//	Re-estimates the model parameters (_pi, _A, _B) of hmm in place from
//	`count` observation sequences, using forward/backward probabilities to
//	build the per-sequence gamma (state posterior) and epsilon (transition
//	posterior, aka ksi or psi) tables. Passes are repeated until
//	hmm_has_converged() reports a stop for the given iteration limit and
//	tolerance.
//
// parameters:
//	hmm          - model to train; must not be null (asserted)
//	observations - array of `count` symbol sequences
//	count        - number of entries in observations
//	iterations   - iteration limit handed to hmm_has_converged()
//	tolerance    - likelihood tolerance handed to hmm_has_converged()
//
// returns:
//	The value from the last pass: the sum of log( scaling[t] ) over all
//	sequences divided by count. Returns 0.0, leaving the model untouched,
//	when both iterations and tolerance are zero or when count <= 0.
//
double hmm_train( HiddenMarkovModel_t * hmm, Vector_t * observations[], int count, int iterations, double tolerance )
{
	assert( hmm != 0 );
	double new_likelihood = 0.0;

	// Nothing to train on. Bail out before the arrays below are sized by
	// count (a non-positive array size is undefined) and before the average
	// likelihood divides by it, which would feed NaN into the model.
	if( count <= 0 )
	{
		return new_likelihood;
	}

	if( ( iterations != 0 ) || ( tolerance != 0.0 ) )
	{
		// the array parameter has decayed to a pointer, so the caller's count is the only size available
		int N = count;
		int current_iteration = 1;
		int stop = 0;
		
		// initialize epsilon (aka, ksi or psi) and gamma
		Matrix_t * epsilon[ N ];
		Matrix_t * gamma[ N ];
		for( int i = 0; i < N; i++ )
		{
			int T = mat_xsize( observations[ i ] );
			epsilon[ i ] = mat_allocate3d( T, hmm->_states, hmm->_states );
			gamma[ i ] = mat_allocate2d( T, hmm->_states );
		}

		// initial log likelihood
		double old_likelihood = 0.0;
		
		// train until done (max iterations or converged within tolerance)
		do
		{
			// train for each sequence in observations
			for( int i = 0; i < N; i++ )
			{
				Vector_t * sequence = observations[ i ];
				int T = mat_xsize( sequence );
				Vector_t * scaling = 0;
				
				// (a) calculate forward and backward probability
				Matrix_t * fwd = hmm_forward( hmm, sequence, &scaling );
				Matrix_t * bwd = hmm_backward( hmm, sequence, scaling );
				
				// (b) calculate the frequency of the transition-emission pair values
				//     and divide by the probability of the entire sequence
				//
				// calculate gamma: fwd * bwd per state, normalized over states at each t
				for( int t = 0; t < T; t++ )
				{
					double s = 0.0;
					for( int k = 0; k < hmm->_states; k++ )
					{
						double gv = mat_get2d( fwd, t, k ) * mat_get2d( bwd, t, k );
						mat_set2d( gamma[ i ], gv, t, k );
						s += gv;
					}
					// leave the row unnormalized when it sums to zero
					if( s != 0.0 )
					{
						for( int k = 0; k < hmm->_states; k++ )
						{
							double gv = mat_get2d( gamma[ i ], t, k );
							mat_set2d( gamma[ i ], (gv / s), t, k );
						}
					}
				}
				
				// calculate epsilon: weight of the k -> l transition between t and t + 1,
				// normalized over all (k, l) pairs at each t
				for( int t = 0; t < T - 1; t++ )
				{
					double s = 0.0;
					// the symbol observed at t + 1 is the same for every (k, l) pair
					int next_symbol = (int)(mat_get1d( sequence, t + 1 ));
					for( int k = 0; k < hmm->_states; k++ )
					{
						for( int l = 0; l < hmm->_states; l++ )
						{
							double gv = mat_get2d( fwd, t, k ) * mat_get2d( bwd, t + 1, l );
							double ev = gv * mat_get2d( hmm->_A, k, l ) * mat_get2d( hmm->_B, l, next_symbol );
							mat_set3d( epsilon[ i ], ev, t, k, l );
							s += ev;
						}
					}
					if( s != 0.0 )
					{
						for( int k = 0; k < hmm->_states; k++ )
						{
							for( int l = 0; l < hmm->_states; l++ )
							{
								double ev = mat_get3d( epsilon[ i ], t, k, l );
								mat_set3d( epsilon[ i ], (ev / s ), t, k, l );
							}
						}
					}
				}
				
				// calculate log likelihood
				for( int t = 0; t < mat_xsize( scaling ); t++ )
				{
					new_likelihood += log( mat_get1d( scaling, t ) );
				}
				
				// free working fwd, bwd and scaling matrix
				mat_deallocate( fwd );
				mat_deallocate( bwd );
				mat_deallocate( scaling );
				scaling = 0;
			}
		
			// average likelihood
			new_likelihood /= (double)N;
			
			// check for convergence
			if( hmm_has_converged( old_likelihood, new_likelihood, current_iteration, iterations, tolerance ) != 0 )
			{
				stop = 1;
			}
			else
			{
				// (c) calculate parameter re-estimation
				++current_iteration;
				old_likelihood = new_likelihood;
				new_likelihood = 0.0;
				
				// re-estimate initial state: mean of gamma at t = 0 over all sequences
				for( int k = 0; k < hmm->_states; k++ )
				{
					double s = 0.0;
					for( int i = 0; i < N; i++ )
					{
						s += mat_get2d( gamma[ i ], 0, k );
					}
					mat_set1d( hmm->_pi, (s / N), k );
				}
				
				// re-estimate transition probabilities: summed epsilon( i -> j ) over summed gamma( i )
				for( int i = 0; i < hmm->_states; i++ )
				{
					for( int j = 0; j < hmm->_states; j++ )
					{
						double den = 0.0;
						double num = 0.0;
						for( int k = 0; k < N; k++ )
						{
							int T = mat_xsize( observations[ k ] );
							for( int l = 0; l < T - 1; l++ )
							{
								double ev = mat_get3d( epsilon[ k ], l, i, j );
								double gv = mat_get2d( gamma[ k ], l, i );
								num += ev;
								den += gv;
							}
						}
						// a state that is never occupied gets a zero transition row
						double av = (den != 0.0) ? num / den : 0.0;
						mat_set2d( hmm->_A, av, i, j );
					}
				}
				
				// re-estimate emission probabilities: gamma( i ) summed where symbol j was observed, over all gamma( i )
				for( int i = 0; i < hmm->_states; i++ )
				{
					for( int j = 0; j < hmm->_symbols; j++ )
					{
						double den = 0.0;
						double num = 0.0;
						for( int k = 0; k < N; k++ )
						{
							int T = mat_xsize( observations[ k ] );
							for( int l = 0; l < T; l++ )
							{
								double gv = mat_get2d( gamma[ k ], l, i );
								int ov = (int)(mat_get1d( observations[ k ], l ));
								if( ov == j ) num += gv;
								den += gv;
							}
						}
						// floor never-seen emissions at 1e-10 instead of zero; this also avoids 0 / 0 when den is zero
						double bv = (num == 0.0) ? 1e-10 : num / den;
						mat_set2d( hmm->_B, bv, i, j );
					}
				}
			}
		}
		while( stop == 0 );
		
		// free epsilon and gamma
		for( int i = 0; i < N; i++ )
		{
			mat_deallocate( epsilon[ i ] );
			mat_deallocate( gamma[ i ] );
		}
	}
	return new_likelihood;
}