Code example #1
static
void
increment_het_ratio_lhood(const starling_options& opt,
                          const starling_deriv_options& dopt,
                          const starling_sample_options& sample_opt,
                          const double indel_error_lnp,
                          const double indel_real_lnp,
                          const double ref_error_lnp,
                          const double ref_real_lnp,
                          const indel_key& ik,
                          const indel_data& id,
                          const double het_ratio,
                          const bool is_tier2_pass,
                          const bool is_use_alt_indel,
                          double* const lhood) {

    // high and low allele ratio variants:
    double het_lhood_high;
    double het_lhood_low;

    indel_digt_caller::get_high_low_het_ratio_lhood(opt,dopt,sample_opt,
                                                    indel_error_lnp,indel_real_lnp,
                                                    ref_error_lnp,ref_real_lnp,
                                                    ik,id,het_ratio,is_tier2_pass,
                                                    is_use_alt_indel,
                                                    het_lhood_high,het_lhood_low);

    lhood[STAR_DIINDEL::HET] = log_sum(lhood[STAR_DIINDEL::HET],het_lhood_low);
    lhood[STAR_DIINDEL::HET] = log_sum(lhood[STAR_DIINDEL::HET],het_lhood_high);
}
Code example #2
// accelerated version with no hyrax q-val mods:
//
static
void
increment_het_ratio_lhood_spi(const snp_pos_info& pi,
                              const blt_float_t het_ratio,
                              const unsigned het_ratio_index,
                              het_ratio_cache<3>& hrcache,
                              blt_float_t* all_het_lhood) {

    // multiply probs of alternate ratios into local likelihoods, then
    // *add* them to the global tally (effectively this is the sum lhood of
    // many different heterozygous genotypes).
    //
    // in the gt_high genotype, the first allele (in lexicographical
    // order) is expected at het_ratio and the second allele is
    // expected at chet_ratio.  gt_low genotype is vice versa.
    //
    blt_float_t lhood_high[DIGT::SIZE];
    blt_float_t lhood_low[DIGT::SIZE];
    for(unsigned gt(0); gt<DIGT::SIZE; ++gt) {
        lhood_high[gt] = 0.;
        lhood_low[gt] = 0.;
    }
    get_high_low_het_ratio_lhood_spi(pi,het_ratio,het_ratio_index,hrcache,lhood_high,lhood_low);

    for(unsigned gt(0); gt<DIGT::SIZE; ++gt) {
        if(not DIGT::is_het(gt)) continue;
        all_het_lhood[gt] = log_sum(all_het_lhood[gt],lhood_high[gt]);
        all_het_lhood[gt] = log_sum(all_het_lhood[gt],lhood_low[gt]);
    }
}
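All of the snippets on this page lean on a two-argument log_sum helper that adds two probabilities held in log space without leaving the log domain. For reference, here is a minimal sketch of the conventional definition (this is the form used in lda-c's utils.c; the other projects may ship their own variants, so treat the exact signature as an assumption):

#include <math.h>

/* Returns log(a + b) given log_a = log(a) and log_b = log(b).
 * Factoring out the larger argument keeps exp() from overflowing:
 * log(a + b) = log_b + log(1 + exp(log_a - log_b)) when log_a <= log_b. */
double log_sum(double log_a, double log_b)
{
    if (log_a < log_b)
        return log_b + log(1 + exp(log_a - log_b));
    return log_a + log(1 + exp(log_b - log_a));
}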
Code example #3
File: pmf.c Project: ilyak/wham
static void pmf_iteration(double *f, double *log_r)
{
	double work_sims[sim_count], work_bins[bin_count];
	int i, j;

	for (i = 0; i < bin_count; i++) {
		for (j = 0; j < sim_count; j++) {
			double bias, dx;

			if (period > 0) {
				dx = fabs(hist_x(i) - bias_x[j]);

				while (dx > 0.5 * period)
					dx -= period;

				bias = 0.5 * bias_k[j] * dx * dx;
			}
			else {
				dx = hist_x(i) - bias_x[j];
				bias = 0.5 * bias_k[j] * dx * dx;
			}
			work_sims[j] = log_nsim[j] + f[j] - beta * bias;
		}
		log_r[i] = log_nbin[i] - log_sum(work_sims, sim_count);
	}

	for (j = 0; j < sim_count; j++) {
		for (i = 0; i < bin_count; i++) {
			double bias, dx;

			if (period > 0) {
				dx = fabs(hist_x(i) - bias_x[j]);

				while (dx > 0.5 * period)
					dx -= period;

				bias = 0.5 * bias_k[j] * dx * dx;
			}
			else {
				dx = hist_x(i) - bias_x[j];
				bias = 0.5 * bias_k[j] * dx * dx;
			}
			work_bins[i] = log_r[i] - beta * bias;
		}
		f[j] = -log_sum(work_bins, bin_count);
	}
}
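Note that pmf.c calls log_sum(array, count) on a whole array rather than on a pair of log values. A minimal sketch of such an array version, assuming the usual max-shift formulation (the name log_sum_array and the exact signature are illustrative; the real helper in ilyak/wham may differ):

#include <math.h>

/* Returns log(sum_i exp(v[i])) for n log-space values. Shifting by the
 * maximum element keeps every exp() argument <= 0, so nothing overflows. */
static double log_sum_array(const double *v, int n)
{
	double max = v[0], s = 0.0;
	int i;

	for (i = 1; i < n; i++)
		if (v[i] > max)
			max = v[i];
	for (i = 0; i < n; i++)
		s += exp(v[i] - max);
	return max + log(s);
}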
Code example #4
File: lda-inference.c Project: IcyLiGit/lda-c
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi)
{
    double converged = 1;
    double phisum = 0, likelihood = 0;
    double likelihood_old = 0, oldphi[model->num_topics];
    int k, n, var_iter;
    double digamma_gam[model->num_topics];

    // compute posterior dirichlet

    for (k = 0; k < model->num_topics; k++)
    {
        var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
        digamma_gam[k] = digamma(var_gamma[k]);
        for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;

    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
        var_iter++;
        for (n = 0; n < doc->length; n++)
        {
            phisum = 0;
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
                phi[n][k] =
                    digamma_gam[k] +
                    model->log_prob_w[k][doc->words[n]];

                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]);
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }

            for (k = 0; k < model->num_topics; k++)
            {
                phi[n][k] = exp(phi[n][k] - phisum);
                var_gamma[k] =
                    var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                // !!! a lot of extra digammas here because of how we're computing it,
                // !!! but it's more automatically updated too.
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }

        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        assert(!isnan(likelihood));
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;

        // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
    }
    return(likelihood);
}
Code example #5
double log_sum(const gsl_vector* x) {
  double sum = gsl_vector_get(x, 0);

  for (unsigned int ii = 1; ii < x->size; ii++) {
    sum = log_sum(sum, gsl_vector_get(x, ii));
  }
  return sum;
}
Code example #6
double log_dot_product(const gsl_vector* log_a, const gsl_vector* log_b) {
  assert(log_a->size == log_b->size);
  double sum = gsl_vector_get(log_a, 0) + gsl_vector_get(log_b, 0);
  for (unsigned int ii = 1; ii < log_a->size; ++ii) {
    sum = log_sum(sum, gsl_vector_get(log_a, ii) +
		       gsl_vector_get(log_b, ii));
  }
  return sum;
}
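A quick usage sketch with hypothetical values: for a = (0.5, 0.5) and b = (0.2, 0.8), log_dot_product returns log(0.5*0.2 + 0.5*0.8) = log(0.5):

gsl_vector* la = gsl_vector_alloc(2);
gsl_vector* lb = gsl_vector_alloc(2);
gsl_vector_set(la, 0, log(0.5)); gsl_vector_set(la, 1, log(0.5));
gsl_vector_set(lb, 0, log(0.2)); gsl_vector_set(lb, 1, log(0.8));
double r = log_dot_product(la, lb);  /* == log(0.5) up to rounding */
gsl_vector_free(la);
gsl_vector_free(lb);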
Code example #7
static
double
integrate_out_sites(const starling_deriv_options& dopt,
                    const uint16_t nsite,
                    const double p_on_site,
                    const bool is_tier2_pass) {

    return log_sum((p_on_site + dopt.site_lnprior),
                   (dopt.get_nonsite_path_lnp(is_tier2_pass,nsite) + dopt.nonsite_lnprior));
}
Code example #8
/*
 * normalize a vector in log space
 *
 * x_i = log(a_i)
 * v = log(a_1 + ... + a_k)
 * x_i = x_i - v
 *
 */
void log_normalize(gsl_vector* x) {
	double v = vget(x, 0);
	for (unsigned int i = 1; i < x->size; i++) {
		v = log_sum(v, vget(x, i));
	}
	
	for (unsigned int i = 0; i < x->size; i++) {
		vset(x, i, vget(x,i)-v);
	}
}
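A quick usage sketch with hypothetical weights: normalizing the log-space vector (log 2, log 3, log 5) with log_normalize leaves entries whose exponentials sum to 1 (vget/vset are the accessors already used above):

gsl_vector* x = gsl_vector_alloc(3);
vset(x, 0, log(2.0));
vset(x, 1, log(3.0));
vset(x, 2, log(5.0));
log_normalize(x);
/* exp(vget(x,0)) == 0.2, exp(vget(x,1)) == 0.3, exp(vget(x,2)) == 0.5 */
gsl_vector_free(x);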
Code example #9
File: util.cpp Project: Nikraaaazy/bosen
float log_sum_vec(float * logvec, int D)
{
	float sum = logvec[0];
	for (int i = 1; i < D; i++) {
		sum = log_sum(sum, logvec[i]);
	}
	return sum;
}
Code example #10
File: opt.cpp Project: Aurametrix/Alg-C
double softmax_f(const gsl_vector * x, void * opt_param)
{
    opt_parameter * gsl_param = (opt_parameter *)opt_param;
    double PENALTY = gsl_param->PENALTY;
    slda * model = gsl_param->model;
    suffstats * ss = gsl_param->ss;

    double f, t, a1 = 0.0, a2 = 0.0;

    int k, d, j, l, idx;

    double f_regularization = 0.0;


    for (l = 0; l < model->num_classes-1; l ++)
    {
        for (k = 0; k < model->num_topics; k ++)
        {
            model->eta[l][k] = gsl_vector_get(x, l*model->num_topics + k);
            f_regularization -= pow(model->eta[l][k], 2) * PENALTY/2.0;
        }
    }
    f = 0.0; //log likelihood
    for (d = 0; d < ss->num_docs; d ++)
    {
        for (k = 0; k < model->num_topics; k ++)
        {
            if (ss->labels[d] < model->num_classes-1)
            {
                f += model->eta[ss->labels[d]][k] * ss->z_bar[d].z_bar_m[k];
            }
        }

        t = 0.0; // in log space,  1+exp()+exp()...
        for (l = 0; l < model->num_classes-1; l ++)
        {
            a1 = 0.0; // \eta_k^T * \bar{\phi}_d
            a2 = 0.0; // 1 + 0.5 * \eta_k^T * Var(z_bar)\eta_k
            for (k = 0; k < model->num_topics; k ++)
            {
                a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k];
                for (j = 0; j < model->num_topics; j ++)
                {
                    idx = map_idx(k, j, model->num_topics);
                    a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j];
                }
            }
            a2 = 1.0 + 0.5 * a2;
            t = log_sum(t, a1 + log(a2));
        }
        f -= t; 
    }

    return -(f + f_regularization);
}
Code example #11
File: var_bayes.cpp Project: DylanV/lda
double var_bayes::inference(const document &doc, std::vector<double>& var_gamma,
                            std::vector<std::vector<double>>& phi) {

    std::vector<double> digamma_gam(numTopics);

    for(int k=0; k<numTopics; k++){
        var_gamma[k] = alpha.alpha[k] + doc.count/numTopics;
    }

    int iteration = 0;
    double converged = 1;
    double phisum;
    std::vector<double> prev_gamma = std::vector<double>(numTopics);

    while((converged > INF_CONV_THRESH) and (iteration < INF_MAX_ITER)){
        iteration++;

        for(int k=0; k<numTopics; k++){
            digamma_gam[k] = digamma(var_gamma[k]);
            prev_gamma[k] = var_gamma[k];
            var_gamma[k] = alpha.alpha[k];
        }

        int n=0;
        for(auto const& word_count : doc.wordCounts){
            phisum = 0;
            for(int k=0; k<numTopics; k++){
                phi[n][k] = digamma_gam[k] + logProbW[k][word_count.first];

                if(k>0){
                    phisum = log_sum(phisum, phi[n][k]);
                } else {
                    phisum = phi[n][k];
                }
            }
            // Estimate gamma and phi
            for(int k=0; k<numTopics; k++){
                phi[n][k] = exp(phi[n][k] - phisum);
                var_gamma[k] += word_count.second*(phi[n][k]);
            }
            n++;
        }

        converged = 0;
        for(int k=0; k<numTopics; ++k){
            converged += fabs(prev_gamma[k] - var_gamma[k]);
        }
        converged /= numTopics;
    }

    return compute_likelihood(doc, var_gamma, phi);
}
Code example #12
// compute just the non-strand-bias portion of the normal marginal
// prior given p(signal), p(no-strand noise), p(strand-bias noise)
//
static
void
get_nostrand_marginal_prior(const blt_float_t* normal_lnprior,
                            const unsigned ref_gt,
                            const blt_float_t sse_rate,
                            const blt_float_t sseb_fraction,
                            std::vector<blt_float_t>& grid_normal_lnprior) {

    const blt_float_t strand_sse_rate(sse_rate*sseb_fraction);
    const blt_float_t nostrand_sse_rate(sse_rate-strand_sse_rate);

    const blt_float_t ln_csse_rate( log1p_switch(-sse_rate) );
    //    const blt_float_t ln_strand_sse_rate( std::log(strand_sse_rate) );
    const blt_float_t ln_nostrand_sse_rate( std::log(nostrand_sse_rate) );

    // fill in normal sample prior for canonical diploid allele frequencies:
    for(unsigned ngt(0); ngt<DIGT::SIZE; ++ngt) {
        grid_normal_lnprior[ngt] = (normal_lnprior[ngt]+ln_csse_rate);
    }

    // nostrand noise prior distributions for each allele combination axis:
    //
    // weight the prior by the potential originating genotypes:
    // if on AB axis, we want P(AA+noiseB)+P(AB+noise)+P(BB+noiseA)
    // so we have P(AA)*error_prob /3 + P(AB)*error_prob + P(BB)*error_prob/3
    //
    static const unsigned n_het_axes(6);
    blt_float_t nostrand_axis_prior[n_het_axes];
    for(unsigned ngt(N_BASE); ngt<DIGT::SIZE; ++ngt) {
        const unsigned axis_id(ngt-N_BASE);
        nostrand_axis_prior[axis_id] = normal_lnprior[ngt];
        // get the two associated homs:
        for(unsigned b(0); b<N_BASE; ++b) {
            if(DIGT::expect2(b,ngt)<=0) continue;
            nostrand_axis_prior[axis_id] = log_sum(nostrand_axis_prior[axis_id],
                                                   normal_lnprior[b]+ln_one_third);
        }
    }

    static const blt_float_t error_mod( -std::log(static_cast<blt_float_t>(DIGT_SGRID::HET_RES*2)) );

    // fill in normal sample prior for 'noise' frequencies:
    for(unsigned ngt(DIGT::SIZE); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) {
        // 'ngt2' is the root diploid state corresponding to noise
        // state 'ngt'
        const unsigned ngt2(DIGT_SGRID::get_digt_state(ngt,ref_gt));
        assert(ngt2>=N_BASE);
        const unsigned axis_id(ngt2-N_BASE);
        grid_normal_lnprior[ngt] = (nostrand_axis_prior[axis_id]+ln_nostrand_sse_rate+error_mod);
        //        grid_normal_lnprior[ngt] = (normal_lnprior[ngt2]+ln_sse_rate+error_mod);
    }
}
Code example #13
// Given a log vector, log a_i, compute v = log sum a_i, subtract v from
// each entry (normalizing the vector in place), and return v.
double log_normalize(gsl_vector* x) {
  double sum = gsl_vector_get(x, 0);
  unsigned int i;

  for (i = 1; i < x->size; i++) {
    sum = log_sum(sum, gsl_vector_get(x, i));
  }

  for (i = 0; i < x->size; i++) {
    double val = gsl_vector_get(x, i);
    gsl_vector_set(x, i, val - sum);
  }
  return sum;
}
Code example #14
File: stirln.cpp Project: rohit104/shdp
/*
 * return the log of the stirling number log(s(n,m))
 * s(n, n) = 1
 * s(n, 0) = 0 if n > 0
 * s(n, m) = 0 if n < m
 * s(n+1, m) = s(n, m-1) + n*s(n, m)
 */
double Stirling::get_log_stirling_num(size_t n, size_t m) {
  if (n < m)  return log_zero;
  size_t start = log_stirling_num_.size();
  for (size_t i = start; i < n+1; ++i) {
    double* v = new double[i+1];
    for (size_t j = 0; j < i + 1; ++j) { v[j] = log_zero; }
    log_stirling_num_.push_back(v);
    log_stirling_num_[i][i] = 0.0;
    for (size_t j = 1; j < i; ++j) {
      log_stirling_num_[i][j] = 
           log_sum(log_stirling_num_[i-1][j-1], 
             log(i-1) + log_stirling_num_[i-1][j]);
    }
  }
  return log_stirling_num_[n][m];
}
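As a concrete check against the recurrence above: these are the unsigned Stirling numbers of the first kind, so s(5,2) = s(4,1) + 4*s(4,2) = 6 + 4*11 = 50, and a call like the following (assuming a default-constructible Stirling instance) should return roughly log(50):

Stirling st;                               // hypothetical construction
double v = st.get_log_stirling_num(5, 2);  // ~= log(50) ~= 3.912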
Code example #15
/*
 * returns the index of an element randomly sampled according to the log
 * probabilities in the array (length is the number of elements)
 */
int log_sample(double* vals, int length) {
  double normalizer = safe_log(0.0);
  int ii;
  for (ii = 0; ii < length; ++ii) {
    normalizer = log_sum(normalizer, vals[ii]);
  }

  double val = 0, sum = 0, cutoff = (double)rand() / ((double)RAND_MAX + 1.0);
  for (ii = 0; ii < length; ++ii) {
    val = exp(vals[ii] - normalizer);
    sum += val;
    if (sum >= cutoff)
      break;
  }
  assert(ii < length);
  return ii;
}
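A quick usage sketch with hypothetical weights: given log weights log 1, log 2, and log 7, log_sample should return index 2 about 70% of the time (the function normalizes internally, so the weights need not sum to one):

#include <math.h>
#include <stdlib.h>

double vals[3] = { log(1.0), log(2.0), log(7.0) };
srand(42);                      /* seed the rand() used inside log_sample */
int idx = log_sample(vals, 3);  /* P(idx == 2) == 0.7 */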
Code example #16
int log_vector_sample(const std::vector<double>& vals, int length) {
  double normalizer = safe_log(0.0);
  int ii = 0;
  assert(length > 0 && length <= (int)vals.size());
  for (ii = 0; ii < length; ++ii) {
    normalizer = log_sum(normalizer, vals[ii]);
  }

  double val = 0, sum = 0, cutoff = (double)rand() / ((double)RAND_MAX + 1.0);
  for (ii = 0; ii < length; ++ii) {
    val = exp(vals[ii] - normalizer);
    sum += val;
    if (sum >= cutoff)
      break;
  }
  assert(ii < length);
  return ii;
}
Code example #17
File: infer.c Project: RenqinCai/sctm
void infer_phi (sctm_data* data, sctm_params* params, sctm_latent* latent,
		sctm_counts* counts) {
	int k, v;
	double p1, p2, lognorm, r;

	for (k = 0; k < params->K; k++) {
		for (v = 0; v < data->V; v++) {
			counts->phiEta_v[k] -= latent->phi[k][v] * params->eta;
			counts->phi_k[v] -= latent->phi[k][v];

			if (counts->n_dij[k][v] > 0)
				latent->phi[k][v] = 1;
			else {
				p1 = lgamma(counts->phiEta_v[k] + params->eta)
						- lgamma(counts->phiEta_v[k] + params->eta
										+ counts->n_dijv[k]);

				p1 += log(params->nu_lambda / (double) data->V + counts->phi_k[v])
						- log(params->nu_lambda / (double) data->V
										+ params->lambda + params->K);

				if (counts->phiEta_v[k] > 0) {
					p2 = lgamma(counts->phiEta_v[k])
						- lgamma(counts->phiEta_v[k] + counts->n_dijv[k]);

					p2 += log(params->lambda + params->K - counts->phi_k[v] - 1)
						- log(params->nu_lambda / (double) data->V
										+ params->lambda + params->K);
					lognorm = log_sum(p1, p2);
				} else lognorm = p1;			

				r = myrand();

				if (log(r) + lognorm <= p1)
					latent->phi[k][v] = 1;
				else
					latent->phi[k][v] = 0;
			}

			counts->phiEta_v[k] += latent->phi[k][v] * params->eta;
			counts->phi_k[v] += latent->phi[k][v];
		}
	}
}
Code example #18
// Given a log matrix, log a_i, compute v = log sum a_i, subtract v from
// every entry (normalizing the matrix in place), and return v.
double log_normalize_matrix(gsl_matrix* x) {
  double sum = gsl_matrix_get(x, 0, 0);

  for (size_t ii = 0; ii < x->size1; ++ii) {
    for (size_t jj = 0; jj < x->size2; ++jj) {
      if (ii == 0 && jj == 0) {
	continue;
      }
      sum = log_sum(sum, gsl_matrix_get(x, ii, jj));      
    }
  }

  for (size_t ii = 0; ii < x->size1; ++ii) {
    for (size_t jj = 0; jj < x->size2; ++jj) {
      double val = gsl_matrix_get(x, ii, jj);
      gsl_matrix_set(x, ii, jj, val - sum);
    }
  }
  return sum;
}
Code example #19
void
indel_digt_caller::
get_indel_digt_lhood(const starling_options& opt,
                     const starling_deriv_options& dopt,
                     const starling_sample_options& sample_opt,
                     const double indel_error_prob,
                     const double ref_error_prob,
                     const indel_key& ik,
                     const indel_data& id,
                     const bool is_het_bias,
                     const double het_bias,
                     const bool is_tier2_pass,
                     const bool is_use_alt_indel,
                     double* const lhood) {

    static const double loghalf(-std::log(2.));

    for (unsigned gt(0); gt<STAR_DIINDEL::SIZE; ++gt) lhood[gt] = 0.;

    const bool is_breakpoint(ik.is_breakpoint());

    const double indel_error_lnp(std::log(indel_error_prob));
    const double indel_real_lnp(std::log(1.-indel_error_prob));
    const double ref_error_lnp(std::log(ref_error_prob));
    const double ref_real_lnp(std::log(1.-ref_error_prob));

    //    typedef read_path_scores::alt_indel_t::const_iterator aiter;

    typedef indel_data::score_t::const_iterator siter;
    siter it(id.read_path_lnp.begin()), it_end(id.read_path_lnp.end());
    for (; it!=it_end; ++it) {
        const read_path_scores& path_lnp(it->second);

        // optionally skip tier2 data:
        if ((! is_tier2_pass) && (! path_lnp.is_tier1_read)) continue;

        // get alt path lnp:
        double alt_path_lnp(path_lnp.ref);
#if 0
        if (is_use_alt_indel && path_lnp.is_alt &&
            (path_lnp.alt > alt_path_lnp)) {
            alt_path_lnp=path_lnp.alt;
        }
#else
        if (is_use_alt_indel and (not path_lnp.alt_indel.empty()) ) {
            typedef read_path_scores::alt_indel_t::const_iterator aiter;
            aiter j(path_lnp.alt_indel.begin()), j_end(path_lnp.alt_indel.end());
            for (; j!=j_end; ++j) {
                if (j->second>alt_path_lnp) alt_path_lnp=j->second;
            }
        }
#endif

        const double noindel_lnp(log_sum(alt_path_lnp+ref_real_lnp,path_lnp.indel+indel_error_lnp));
        const double hom_lnp(log_sum(alt_path_lnp+ref_error_lnp,path_lnp.indel+indel_real_lnp));

        // allele ratio convention is that the indel occurs at the
        // het_allele ratio and the alternate allele occurs at
        // (1-het_allele_ratio):

        double log_ref_prob(loghalf);
        double log_indel_prob(loghalf);
        if (not is_breakpoint) {
            static const double het_allele_ratio(0.5);
            get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank,
                                          ik,het_allele_ratio,log_ref_prob,log_indel_prob);
        }
        const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob));

        lhood[STAR_DIINDEL::NOINDEL] += integrate_out_sites(dopt,path_lnp.nsite,noindel_lnp,is_tier2_pass);
        lhood[STAR_DIINDEL::HOM]     += integrate_out_sites(dopt,path_lnp.nsite,hom_lnp,is_tier2_pass);
        lhood[STAR_DIINDEL::HET]     += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass);

#ifdef DEBUG_INDEL_CALL
        //log_os << std::setprecision(8);
        //log_os << "INDEL_CALL i,ref_lnp,indel_lnp,lhood(noindel),lhood(hom),lhood(het): " << i << " " << path_lnp.ref << " " << path_lnp.indel << " " << lhood[STAR_DIINDEL::NOINDEL] << " " << lhood[STAR_DIINDEL::HOM] << " " << lhood[STAR_DIINDEL::HET] << "\n";
#endif
    }


    if (is_het_bias) {
        // loop is currently setup to assume a uniform het ratio subgenotype prior
        const unsigned n_bias_steps(1+static_cast<unsigned>(het_bias/opt.het_bias_max_ratio_inc));
        const double ratio_increment(het_bias/static_cast<double>(n_bias_steps));
        for (unsigned step(0); step<n_bias_steps; ++step) {
            const double het_ratio(0.5+(step+1)*ratio_increment);
            increment_het_ratio_lhood(opt,dopt,sample_opt,
                                      indel_error_lnp,indel_real_lnp,
                                      ref_error_lnp,ref_real_lnp,
                                      ik,id,het_ratio,is_tier2_pass,is_use_alt_indel,lhood);
        }

        const unsigned n_het_subgt(1+2*n_bias_steps);
        const double subgt_log_prior(std::log(static_cast<double>(n_het_subgt)));
        lhood[STAR_DIINDEL::HET] -= subgt_log_prior;
    }
}
Code example #20
File: doc_e_step.cpp Project: dokyum/tiLDA
double doc_e_step(const t_document* doc, const double* dirichlet_prior, double* nu,
		double** digamma_lambda, double* digamma_lambda_sum, const t_setting* setting,
		const int doc_id, double** rho, double* old_rho)
{
	const int& numtopics = setting->num_topics;
	const int& doclength = doc->length;
	const double doctotaloverk = (doc->total) / (double) (numtopics);

	// Initialize rho, nu
	for (int i = 0; i < numtopics; ++i) {
		for (int l = 0; l < doclength; ++l) {
			rho[l][i] = oneoverk;
		}
		nu[i] = dirichlet_prior[i] + doctotaloverk;
		digamma_nu[doc_id][i] = digamma(nu[i]);
	}

	int doc_loop = 0;
	double doc_likelihood = 0;
	double doc_likelihood_old = 0;
	double doc_converged = 1;
	double nu_sum = 0;
	double indep_part_likelihood = 0;
	double dep_part_likelihood = 0;

	while ((doc_loop < 2)
			|| ((doc_converged > setting->doc_converged) && (doc_loop < setting->doc_max_iter))) {
		doc_loop += 1;
		for (int l = 0; l < doclength; ++l) {
			double rhosum = 0;
			for (int i = 0; i < numtopics; ++i) {
				old_rho[i] = rho[l][i];
				rho[l][i] = digamma_nu[doc_id][i] + digamma_lambda[i][doc->words[l]] - digamma_lambda_sum[i];
				assert(rho[l][i] != 0);
				assert(!std::isnan(rho[l][i]));
				if (i > 0) {
					rhosum = log_sum(rhosum, rho[l][i]);
				} else {
					rhosum = rho[l][i];
				}
				assert(!std::isnan(rhosum));
			}
			for (int i = 0; i < numtopics; ++i) {
				rho[l][i] = exp(rho[l][i] - rhosum);
				nu[i] = nu[i] + (doc->counts[l]) * (rho[l][i] - old_rho[i]);
				// !!! a lot of extra digammas here because of how we're computing it,
				// !!! but it's more automatically updated too.
				digamma_nu[doc_id][i] = digamma(nu[i]);
				assert(!std::isnan(digamma_nu[doc_id][i]));
			}
		}

		nu_sum = 0;
		for (int i = 0; i < numtopics; ++i) {
			nu_sum += nu[i];
		}
		digamma_nu_sum[doc_id] = digamma(nu_sum);

		indep_part_likelihood = -lgamma(nu_sum);
		dep_part_likelihood = 0;
		for (int i = 0; i < numtopics; ++i) {
			double delta = (digamma_nu[doc_id][i] - digamma_nu_sum[doc_id]);
			indep_part_likelihood += lgamma(nu[i]) - delta * nu[i];
			dep_part_likelihood += delta * dirichlet_prior[i];
			for (int l = 0; l < doclength; ++l) {
				if (rho[l][i] > 0) {
					indep_part_likelihood += rho[l][i] * (doc->counts[l])
							* (delta + digamma_lambda[i][doc->words[l]] - digamma_lambda_sum[i] - log(rho[l][i]));
				}
			}
		}
		assert(!std::isnan(indep_part_likelihood));
		assert(!std::isnan(dep_part_likelihood));
		doc_likelihood = indep_part_likelihood + dep_part_likelihood;

		doc_converged = (doc_likelihood_old - doc_likelihood) / doc_likelihood_old;
//		if (0 != doc_likelihood_old && doc_likelihood < doc_likelihood_old) {
//			printf("Warning: doc_likelihood is decreasing. doc_id: %d \t step: %d \t old: %.8f \t new: %.8f \t ratio: %.8f\n",
//					doc_id, doc_loop, doc_likelihood_old, doc_likelihood, doc_converged);
//		}

		assert((doc_loop == 1) || (doc_likelihood >= doc_likelihood_old)
				|| (((doc_likelihood_old - doc_likelihood) / fabs(doc_likelihood_old) < DOC_DECREASE_ALLOWANCE)
						&& (doc_loop >= 4)));


		doc_likelihood_old = doc_likelihood;
	}

	if (doc_loop >= setting->doc_max_iter) {
		printf("doc loop max reached %d\n", doc_id);
		exit(-1);
	}

	return indep_part_likelihood;
}
Code example #21
// calculate probability of strand-specific noise
//
// accelerated version with no hyrax q-val mods:
//
// the ratio key can be used as a proxy for the het ratio to look up cached results:
//
static
void
get_strand_ratio_lhood_spi(const snp_pos_info& pi,
                           const unsigned ref_gt,
                           const blt_float_t het_ratio,
                           const unsigned het_ratio_index,
                           het_ratio_cache<2>& hrcache,
                           blt_float_t* lhood) {

    // het_ratio is the expected allele frequency of noise on the
    // noise-strand, or "on-strand" below. All possible ratio values
    // (there should be very few), have an associated index value for
    // caching.
    //
    const blt_float_t chet_ratio(1.-het_ratio);

    const unsigned n_calls(pi.calls.size());


    // In this situation every basecall falls into 1 of 4 states:
    //
    // 0: off-strand non-reference allele (0)
    // 1: on-strand non-reference allele (het_ratio)  (cached)
    // 2: on-strand agrees with the reference (chet_ratio) (cached)
    // 3: off-strand agree with the reference (1)
    //
    // The off-strand states are not cached below because they're
    // simpler to compute
    //
    blt_float_t lhood_fwd[DIGT_SGRID::STRAND_SIZE]; // "on-strand" is fwd
    blt_float_t lhood_rev[DIGT_SGRID::STRAND_SIZE]; // "on-strand" is rev

    for(unsigned i(0); i<DIGT_SGRID::STRAND_SIZE; ++i) {
        lhood_fwd[i] = 0;
        lhood_rev[i] = 0;
    }

    static const unsigned n_strand_het_axes(3);

    for(unsigned i(0); i<n_calls; ++i) {
        const base_call& bc(pi.calls[i]);

        std::pair<bool,cache_val<2>*> ret(hrcache.get_val(bc.get_qscore(),het_ratio_index));
        cache_val<2>& cv(*ret.second);
        if(! ret.first) {
            const blt_float_t eprob(bc.error_prob());
            const blt_float_t ceprob(1.-eprob);
            // cached value [0] refers to state 2 above: on-strand
            // reference allele
            cv.val[0]=(std::log((ceprob)*chet_ratio+((eprob)*one_third)*het_ratio));
            // cached value [1] refers to state 1 above: on-strand
            // non-reference allele
            cv.val[1]=(std::log((ceprob)*het_ratio+((eprob)*one_third)*chet_ratio));
        }

        const uint8_t obs_id(bc.base_id);

        if(obs_id==ref_gt) {
            //const double val_onstr(std::log((ceprob)*chet_ratio+((eprob)*one_third)*het_ratio));
            const blt_float_t val_off_strand(bc.ln_comp_error_prob());
            const blt_float_t val_fwd(bc.is_fwd_strand ? cv.val[0] : val_off_strand);
            const blt_float_t val_rev(bc.is_fwd_strand ? val_off_strand : cv.val[0]);
            for(unsigned sgt(0); sgt<n_strand_het_axes; ++sgt) {
                lhood_fwd[sgt] += val_fwd;
                lhood_rev[sgt] += val_rev;
            }
        } else {
            //const double val_onstr(std::log((ceprob)*het_ratio+((eprob)*one_third)*chet_ratio));
            const blt_float_t val_off_strand(bc.ln_error_prob()+ln_one_third);
            const blt_float_t val_fwd(bc.is_fwd_strand ? cv.val[1] : val_off_strand);
            const blt_float_t val_rev(bc.is_fwd_strand ? val_off_strand : cv.val[1]);

            const unsigned match_strand_state(obs_id>ref_gt ? obs_id-1 : obs_id);
            for(unsigned sgt(0); sgt<n_strand_het_axes; ++sgt) {
                if(sgt==match_strand_state) {
                    lhood_fwd[sgt] += val_fwd;
                    lhood_rev[sgt] += val_rev;
                } else {
                    lhood_fwd[sgt] += val_off_strand;
                    lhood_rev[sgt] += val_off_strand;
                }
            }
        }
    }

    for(unsigned i(0); i<DIGT_SGRID::STRAND_SIZE; ++i) {
        lhood[i] = log_sum(lhood_fwd[i],lhood_rev[i])+ln_one_half;
    }
}
Code example #22
File: opt.cpp Project: Aurametrix/Alg-C
void softmax_df(const gsl_vector * x, void * opt_param, gsl_vector * df)
{

    opt_parameter * gsl_param = (opt_parameter *)opt_param;
    double PENALTY = gsl_param->PENALTY;
    slda * model = gsl_param->model;
    suffstats * ss = gsl_param->ss;
    gsl_vector_set_zero(df);
    gsl_vector * df_tmp = gsl_vector_alloc(df->size);

    double t, a1 = 0.0, a2 = 0.0, g;
    int k, d, j, l, idx;

    double * eta_aux = new double [model->num_topics];

    for (l = 0; l < model->num_classes-1; l ++)
    {
        for (k = 0; k < model->num_topics; k ++)
        {
            idx = l*model->num_topics + k;
            model->eta[l][k] = gsl_vector_get(x, idx); 
            g = -PENALTY * model->eta[l][k];
            gsl_vector_set(df, idx, g);
        }
    }
    for (d = 0; d < ss->num_docs; d ++)
    {
        for (k = 0; k < model->num_topics; k ++)
        {
            l = ss->labels[d];
            if (l < model->num_classes-1)
            {
                idx = l*model->num_topics + k;
                g = gsl_vector_get(df, idx) + ss->z_bar[d].z_bar_m[k];
                gsl_vector_set(df, idx, g);
            }
        }

        t = 0.0; // in log space, 1+exp()+exp()+....
        gsl_vector_memcpy(df_tmp, df);
        gsl_vector_set_zero(df);
        for (l = 0; l < model->num_classes-1; l ++)
        {
            memset(eta_aux, 0, sizeof(double)*model->num_topics);
            a1 = 0.0; // \eta_k^T * \bar{\phi}_d
            a2 = 0.0; // 1 + 0.5*\eta_k^T * Var(z_bar)\eta_k
            for (k = 0; k < model->num_topics; k ++)
            {
                a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k];
                for (j = 0; j < model->num_topics; j ++)
                {
                    idx = map_idx(k, j, model->num_topics);
                    a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j];
                    eta_aux[k] += ss->z_bar[d].z_bar_var[idx] * model->eta[l][j];
                }
            }
            a2 = 1.0 + 0.5 * a2;
            t = log_sum(t, a1 + log(a2));

            for (k = 0; k < model->num_topics; k ++)
            {
                idx = l*model->num_topics + k;
                g =  gsl_vector_get(df, idx) -
                     exp(a1) * (ss->z_bar[d].z_bar_m[k] * a2 + eta_aux[k]);
                gsl_vector_set(df, idx, g);
            }
        }
        gsl_vector_scale(df, exp(-t));
        gsl_vector_add(df, df_tmp);
    }
    gsl_vector_scale(df, -1.0);
    delete [] eta_aux;
    gsl_vector_free(df_tmp);
}
Code example #23
File: main.c Project: hsoleimani/STAT540Project
double doc_inference(vblda_corpus* corpus, vblda_model* model, vblda_ss* ss, int d, int test){

	int n, j, variter, w;
	double c, phisum, temp, cphi;
	double varlkh, prev_varlkh, conv;

	prev_varlkh = -1e100;
	conv = 0.0;
	variter = 0;
	do{
		varlkh = 0.0;
		for (n = 0; n < corpus->docs[d].length; n++){
			w = corpus->docs[d].words[n];
			c = (double) corpus->docs[d].counts[n];

			phisum = 0.0;
			for (j = 0; j < model->m; j++){
				ss->oldphi[j] = corpus->docs[d].phi[n][j];

				corpus->docs[d].phi[n][j] = gsl_sf_psi(model->gamma[j][d]) + model->psimu[j][w];
				if (j > 0)
					phisum = log_sum(phisum, corpus->docs[d].phi[n][j]);
				else
					phisum = corpus->docs[d].phi[n][j];
			}
			for (j = 0; j < model->m; j++){

				corpus->docs[d].phi[n][j] = exp(corpus->docs[d].phi[n][j] - phisum);

				temp = c*(corpus->docs[d].phi[n][j] - ss->oldphi[j]);
				model->gamma[j][d] += temp;
				ss->sumgamma[d] += temp;

				if (corpus->docs[d].phi[n][j] > 0){
					cphi = c*corpus->docs[d].phi[n][j];
					varlkh += cphi*(model->psimu[j][w]-log(corpus->docs[d].phi[n][j]));
				}
			}
		}
		varlkh -= lgamma(ss->sumgamma[d]);
		for (j = 0; j < model->m; j++){
			varlkh += lgamma(model->gamma[j][d]);
		}

		conv = fabs(prev_varlkh - varlkh)/fabs(prev_varlkh);
		if (prev_varlkh > varlkh){
			printf("ooops doc %d, %lf %lf, %5.10e\n", d, varlkh, prev_varlkh, conv);
		}
		prev_varlkh = varlkh;
		variter ++;

	}while((variter < MAXITER) && (conv > CONVERGED));

	if (test == 0){
		for (n = 0; n < corpus->docs[d].length; n++){
			w = corpus->docs[d].words[n];
			c = (double) corpus->docs[d].counts[n];
			for (j = 0; j < model->m; j++){
				cphi = corpus->docs[d].phi[n][j];
				varlkh -= cphi*model->psimu[j][w];
				ss->t[j][w] += cphi;
			}
		}
	}
	return(varlkh);

}
Code example #24
void update_genotype_interval(vector<float> &genotype_interval, vector<float> &interval_cuts, PosteriorInference &local_posterior) {
  for (unsigned int i_cut = 0; i_cut < genotype_interval.size(); i_cut++) {
    genotype_interval[i_cut] = log_sum(genotype_interval[i_cut], local_posterior.gq_pair.LogDefiniteIntegral(interval_cuts[i_cut+1], interval_cuts[i_cut]) + local_posterior.params_ll);
  }
}
Code example #25
File: update_zeta.cpp Project: aacharya/DSLDA2
void update_zeta_i(mxArray *retptr, double *sstopicwordptr, double *ssfeatures, double *zetai, double *windexi, double *wcounti, const mxArray *model, const mxArray *data, const double *Esticks, const double *Eloglambda, const double *Eloggamma, const double *smallphi, const int ndistWords, const int i, double *count2, double *option)
{
    int index, nK1, nK2, t, k, k1, T, K1, K2, V, N, tempind1, tempind2, j, y, C2, Y, phase;
    double logsum, logsum1, logsum2, minval, val1, val2, val3, value, epsilon, valtemp;
    double *tmpptr, *tmp1, *log_beta, *dmu, *r, *annotations, *classlabels, *nwordspdoc;
    mxArray *tmp;
    
    minval      = mxGetScalar(mxGetField(model,0,"MINVALUE"));
    phase       = mxGetScalar(mxGetField(model,0,"phase"));
    classlabels = (double*)mxGetPr(mxGetField(data,0,"classlabels"));
    nwordspdoc  = (double*)mxGetPr(mxGetField(data,0,"nwordspdoc"));
    annotations = (double*)mxGetPr(mxGetField(data,0,"annotations"));
    
    r       = mxGetPr(mxGetField(model,0,"r"));
    V       = mxGetScalar(mxGetField(model,0,"V"));
    N       = mxGetScalar(mxGetField(model,0,"N"));
    C2      = mxGetScalar(mxGetField(model,0,"C2"));
    Y       = mxGetScalar(mxGetField(data,0,"Y"));
    T       = mxGetScalar(mxGetField(model,0,"T"));
    K1      = mxGetScalar(mxGetField(model,0,"K1"));
    K2      = mxGetScalar(mxGetField(model,0,"K2"));
    epsilon = mxGetScalar(mxGetField(model,0,"epsilon"));
    
    if((int)option[0]==1)
    {
        nK1 = (T+K2);
        nK2 = (K1+K2);
    }
    else
    {
        nK1 = T;
        nK2 = K1;
    }
    
    if(phase==1)
    {
        // use the dual variable only in training phase; no dual variable in test phase
        dmu     = (double*)mxGetPr(mxGetField(model,0,"dmu"));
    }
    
    tmp    = mxCreateDoubleMatrix(ndistWords,nK1,mxREAL);
    tmpptr = mxGetPr(tmp);
    
    for (j=0; j<ndistWords; j++)  // loop over (distinct) words
    {
        logsum  = 0;
        logsum1 = 0;
        logsum2 = 0;
        
        tmp1 = Malloc(double,nK1);
        
        for (k=0; k<nK1; k++)     // loop over third dimension
        {
            tempind1 = k + ((int)(windexi[j])-1)*nK2;
            val1     = 0;
            
            ////////////////////////////////////////////////////////////////////////////////////////////
            // terms from document level supervision
            if(phase==1 && (int)classlabels[i]>=1)   // use the dual variable only in training phase only when label is present; no dual variable in test phase;
            {
                if(k<T)          // for unsupervised topics
                {
                    for (y=0; y<Y; y++)
                    {
                        valtemp = 0;
                        for (k1=0; k1<K1; k1++)
                        {
                            valtemp = valtemp + (r[k1*Y+ (int)classlabels[i]-1] - r[k1*Y+y])*smallphi[i+k*N+k1*N*T];
                        }
                        val1 = val1 + dmu[i+y*N]*valtemp;
                    }
                }
                else   // for NPDSLDA
                    if((int)option[0]==1)// for supervised topics
                    {
                    for (y=0; y<Y; y++)
                    {
                        val1 = val1 + dmu[i+y*N]*(r[(int)classlabels[i]-1 + (k-T+K1)*Y] - r[y + (k-T+K1)*Y]);
                    }
                    val1 =  val1/nwordspdoc[i];
                    }
            }
            
            ////////////////////////////////////////////////////////////////////////////////////////////
            // for other terms
            if(k<T) // for unsupervised topics
            {
                val2 = 0;
                for(k1=0; k1<K1; k1++)
                {
                    tempind2 = k1 + ((int)(windexi[j])-1)*nK2;
                    val2     = val2 + smallphi[i+k*N+k1*N*T]*Eloglambda[tempind2];
                }
                val3 = Esticks[i+k*N];
                *(tmp1+k) = val1 + val2 + val3;
            }
            else    // for supervised topics
                if((int)option[0]==1)  // for NPDSLDA
                {
                *(tmp1+k) = Eloggamma[i+k*N] + Eloglambda[tempind1] + val1;
                }
            
            
            if(phase==1) // only in training phase; no need to have any clause for test phase
            {
                if((int)option[0]==1) // NPDSLDA
                {
                    if (k<T)    // unsupervised topics
                        logsum1 = log_sum(*(tmp1+k),logsum1);
                    else        // supervised topics
                    {
                        if(*(annotations+(k-T)*N+i)==0);  // empty body: ignore this phi when the annotation is absent
                        else
                            logsum2 = log_sum(*(tmp1+k),logsum2);
                    }
                }
                else                  // NPLDA
                    logsum = log_sum(*(tmp1+k),logsum);
            }
        }
        
        // conversion from log space to real number
        for (k=0; k<nK1; k++)
        {
            if((int)option[0]==1) // NPDSLDA
            {
                if(k<T)     // unsupervised topics
                {
                    if(logsum1 - *(tmp1+k)>10000)
                        tmpptr[k*ndistWords+j] = minval;
                    if(logsum1 - *(tmp1+k)<10000)
                        tmpptr[k*ndistWords+j] = (1-epsilon)*exp(*(tmp1+k) - logsum1) + minval;
                }
                else        // supervised topics
                {
                    if(logsum2 - *(tmp1+k)>10000)
                        tmpptr[k*ndistWords+j] = minval;
                    if(logsum2 - *(tmp1+k)<10000)
                        tmpptr[k*ndistWords+j] = epsilon*exp(*(tmp1+k) - logsum2) + minval;
                    if (phase==1 && k>=T && *(annotations+(k-T)*N+i)==0) // only in training phase
                    {
                        tmpptr[k*ndistWords+j] = 0;  // DSLDA
                        //mexPrintf("%d %d %d hey here!\n", i, j, k);
                    }
                }
            }
            else  // NPLDA
            {
                if(logsum - *(tmp1+k)>10000)
                    tmpptr[k*ndistWords+j] = minval;
                if(logsum - *(tmp1+k)<10000)
                    tmpptr[k*ndistWords+j] = exp(*(tmp1+k) - logsum) + minval;
            }
        }
        
        free(tmp1);
    }
    
    // update sufficient statistics -- for both unsupervised and supervised topics
    for (k = 0; k < nK2; k++)
    {
        for (j = 0; j < ndistWords; j++)
        {
            value = 0;
            if(k<K1)  // unsupervised topics
            {
                for (t = 0; t < T; t++) // loop over topics
                {
                    //mexPrintf("hey here1! %d %d %d %d %f\n", k, i, ndistWords, t, windexi[j]);
                    value += smallphi[i+t*N+k*N*T]*tmpptr[j+t*ndistWords]; // smallphi{ntk1}*zeta{nmt}
                }
                value = wcounti[j]*value;
            }
            else    // for NPDSLDA
                if((int)option[0]==1)// supervised topics
                {
                value = wcounti[j]*tmpptr[j+(k-K1+T)*ndistWords]; // zeta{nmk2}
                }
            index = k + ((int)windexi[j]-1)*nK2;
            sstopicwordptr[index] += value;
            ssfeatures[i+k*N] += value;
        }
    }
    
    mxSetCell (retptr, i, tmp);
    
    return;
}
Code example #26
File: infer-cmnt.c Project: RenqinCai/sctm
void infer_xi(sctm_data* data, sctm_params* params, sctm_latent* latent,
		sctm_counts* counts) {
	int d, i, a, k, n;
	double p1, p2, p11, p22, norm;

	for (d = 0; d < data->D; d++) {
		documents* doc = &(data->docs[d]);
		for (i = 0; i < doc->C; i++) {
			comment* cmnt = &(doc->cmnts[i]);
			for (a = 0; a < doc->S; a++) {
				sentence* sent = &(doc->sents[a]);

				p1 = 0.;
				p2 = 0.;
				for (n = 0; n < cmnt->N; n++) {
					k = latent->y[d][i][n];
//					if (latent->t[d][i][n] == 0)
//						continue;
					if (latent->xi[d][i][a] == 1) {
						p11 = log(counts->m[d][i][k] * 1.0) - log(counts->m_k[d][i]*1.0);
						// if only article sentence (among selected) in which k topic occurs:
						if (counts->m_k[d][i] - sent->N == 0) p22 = -INFINITY;
						else p22 = log((counts->m[d][i][k] - counts->n_jv[d][a][k]) * 1.0)
								 - log((counts->m_k[d][i] - sent->N)*1.0);
					} else {
						p11 = log((counts->m[d][i][k] + counts->n_jv[d][a][k]) * 1.0)
								- log((counts->m_k[d][i] + sent->N)*1.0);
						// if only article sentence (among selected) in which y topic occurs:
						if (counts->m_k[d][i] == 0) p22 = -INFINITY;
						else p22 = log(counts->m[d][i][k] * 1.0) - log(counts->m_k[d][i]*1.0);
					}
					p1 += p11;
					p2 += p22;
					if (isinf(p2) && !isinf(p1)) break;
				}

				p1 += log(params->vr1 * 1.0) - log((params->vr1 + params->vr2)*1.0);
				p2 += log(params->vr2 * 1.0) - log((params->vr1 + params->vr2)*1.0);

				norm = 0;
				if (isinf(p2) && p2 < 0) {
					p1 = 1.0;
					p2 = 0.0;
				} else{
					norm = log_sum(p1,p2);
					p1 -= norm;
					p1 = exp(p1);
				}

				if (isnan(p1) || isinf(p1) || isinf(norm)) {
					printf("\nd:%d i:%d a:%d p1:%lf p2:%lf\n",d,i,a,p1,p2);
					debug("incorrect probs: infer_xi");
				}

				if (myrand() <= p1) {
					if (latent->xi[d][i][a] == 0) {
						for (k=0; k < params->K; k++)
							counts->m[d][i][k] += counts->n_jv[d][a][k];
						counts->m_k[d][i] += sent->N;
					}
					latent->xi[d][i][a] = 1;
				}
				else {
					if (latent->xi[d][i][a] == 1) {
						for (k=0; k < params->K; k++)
							counts->m[d][i][k] -= counts->n_jv[d][a][k];
						counts->m_k[d][i] -= sent->N;
					}
					latent->xi[d][i][a] = 0;
				}
				latent->xi_prob[d][i][a] = p1;
			} //a
		} //i
	} //d
}
Code example #27
File: gmm.c Project: czxxjtu/videosearch
void gmm_compute_p (int n, const float * v,
                    const gmm_t * g,
                    float * p,
                    int flags)
{
    if(n==0) return; /* sgemm doesn't like empty matrices */

    long i, j, l;
    double dtmp;
    long d=g->d, k=g->k;


    float * logdetnr = fvec_new(k);

    for (j = 0 ; j < k ; j++) {
        logdetnr[j] = -d / 2.0 * log (2 * M_PI);
        for (i = 0 ; i < d ; i++)
            logdetnr[j] -= 0.5 * log (g->sigma[j * d + i]);
    }

    /* compute all probabilities in log domain */

    /* compute squared Mahalanobis distances (result in p) */

    if(0) { /* simple & slow */
        for (i = 0 ; i < n ; i++) {
            for (j = 0 ; j < k ; j++) {
                dtmp = 0;
                for (l = 0 ; l < d ; l++) {
                    dtmp += sqr (v[i * d + l] - g->mu[j * d + l]) / g->sigma[j * d + l];
                }
                p[i * k + j] = dtmp;
            }
        }
    } else { /* complicated & fast */
        compute_mahalanobis_sqr(n,k,d,g->mu,g->sigma,v,p);
    }

    /* convert distances to probabilities, staying in the log domain
       until the very end */
    for (i = 0 ; i < n ; i++) {

        for (j = 0 ; j < k ; j++) {
            p[i * k + j] = logdetnr[j] - 0.5 * p[i * k + j];
            CHECKFINITE(p[i * k + j]);
        }

        /* at this point, we have p(x|ci) -> we want p(ci|x) */


        if(flags & GMM_FLAGS_NO_NORM) {     /* skip normalization */

            dtmp=0;

        } else {                            /* compute the normalization factor */

            dtmp = p[i * k + 0];

            if(flags & GMM_FLAGS_W)
                dtmp+=log(g->w[0]);

            for (j = 1 ; j < k ; j++) {
                double log_p=p[i * k + j];

                if(flags & GMM_FLAGS_W)
                    log_p+=log(g->w[j]);

                dtmp = log_sum (dtmp, log_p);
            }

            /* now dtmp contains the log of sums */
        }

        for (j = 0 ; j < k ; j++) {
            double log_norm=0;

            if(flags & GMM_FLAGS_W)
                log_norm=log(g->w[j])-dtmp;
            else
                log_norm=-dtmp;

            p[i * k + j] = exp (p[i * k + j] + log_norm);
            CHECKFINITE(p[i * k + j]);
        }

        //    printf ("p[%d] = ", i);
        //    fvec_print (p + i * k, k);
    }

    free(logdetnr);

}
Code example #28
void
indel_digt_caller::
get_high_low_het_ratio_lhood(const starling_options& /*opt*/,
                             const starling_deriv_options& dopt,
                             const starling_sample_options& sample_opt,
                             const double indel_error_lnp,
                             const double indel_real_lnp,
                             const double ref_error_lnp,
                             const double ref_real_lnp,
                             const indel_key& ik,
                             const indel_data& id,
                             const double het_ratio,
                             const bool is_tier2_pass,
                             const bool is_use_alt_indel,
                             double& het_lhood_high,
                             double& het_lhood_low) {

    // handle het ratio and its complement in one step:
    const double chet_ratio(1.-het_ratio);

    const double log_het_ratio(std::log(het_ratio));
    const double log_chet_ratio(std::log(chet_ratio));

    const bool is_breakpoint(ik.is_breakpoint());

    het_lhood_high=0;
    het_lhood_low=0;

    //    typedef read_path_scores::alt_indel_t::const_iterator aiter;

    typedef indel_data::score_t::const_iterator siter;
    siter i(id.read_path_lnp.begin()), i_end(id.read_path_lnp.end());
    for (; i!=i_end; ++i) {
        const read_path_scores& path_lnp(i->second);

        // optionally skip tier2 data:
        if ((! is_tier2_pass) && (! path_lnp.is_tier1_read)) continue;

        // get alt path lnp:
        double alt_path_lnp(path_lnp.ref);
#if 0
        if (is_use_alt_indel && path_lnp.is_alt &&
            (path_lnp.alt > alt_path_lnp)) {
            alt_path_lnp=path_lnp.alt;
        }
#else
        if (is_use_alt_indel && (! path_lnp.alt_indel.empty()) ) {
            typedef read_path_scores::alt_indel_t::const_iterator aiter;
            aiter j(path_lnp.alt_indel.begin()), j_end(path_lnp.alt_indel.end());
            for (; j!=j_end; ++j) {
                if (j->second>alt_path_lnp) alt_path_lnp=j->second;
            }
        }
#endif

        const double noindel_lnp(log_sum(alt_path_lnp+ref_real_lnp,path_lnp.indel+indel_error_lnp));
        const double hom_lnp(log_sum(alt_path_lnp+ref_error_lnp,path_lnp.indel+indel_real_lnp));

        // allele ratio convention is that the indel occurs at the
        // het_allele ratio and the alternate allele occurs at
        // (1-het_allele_ratio):
        {
            double log_ref_prob(log_chet_ratio);
            double log_indel_prob(log_het_ratio);
            if (! is_breakpoint) {
                get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank,
                                              ik,het_ratio,log_ref_prob,log_indel_prob);
            }
            const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob));

            het_lhood_low += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass);
        }

        {
            double log_ref_prob(log_het_ratio);
            double log_indel_prob(log_chet_ratio);
            if (! is_breakpoint) {
                get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank,
                                              ik,chet_ratio,log_ref_prob,log_indel_prob);
            }
            const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob));

            het_lhood_high += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass);
        }
    }
}
Code example #29
File: ActiveSelection.cpp Project: aacharya/DSLDA2
void update_phin(double *temp_phin, const mxArray *phi_n, const double *windexn, const double *wcountn, const mxArray *model, const mxArray *data, const double *psigammaptr, const int n, const int phase, const double *annotations, int option)
{
    int ndistWords, nK, k1, k2, V, N, tempind, i, j, y, C2, Y; // number of words, maximum number of topics, maximum number of observed topics
    double logsum1, logsum2, minval, val, epsilon;
    mxArray *tmp;
    double *tmpptr, *tmp1, *log_beta, *mu, *eta, *classlabels;
    double *nwordspdoc = mxGetPr(mxGetField(data,0,"nwordspdoc"));
    
    minval = mxGetScalar(mxGetField(model,0,"MINVALUE"));
    ndistWords = mxGetM(phi_n);
    nK     = mxGetN(phi_n);
    tmp    = mxCreateDoubleMatrix(ndistWords,nK,mxREAL);
    tmpptr = mxGetPr(tmp);
    log_beta = mxGetPr(mxGetField(model,0,"log_beta"));
    V      = mxGetScalar(mxGetField(model,0,"V"));
    
    //mexPrintf("till here ok1\n");
    if(option>=3)
    {
        classlabels = mxGetPr(mxGetField(data,0,"classlabels"));
        mu      = mxGetPr(mxGetField(model,0,"mu"));
        eta     = mxGetPr(mxGetField(model,0,"eta"));
        C2 = (int)mxGetScalar(mxGetField(model,0,"C2"));
        Y  = (int)mxGetScalar(mxGetField(model,0,"Y"));
        if(option>=4)
        {
            k1 = mxGetScalar(mxGetField(model,0,"k1"));
            k2 = mxGetScalar(mxGetField(model,0,"k2"));
            epsilon = mxGetScalar(mxGetField(model,0,"epsilon"));
            //mexPrintf("\nY: %d\n",Y);
        }
    }
    
    int lowlimit, uplimit;
    if(option==5) // DSLDA-NSLT
    {
        lowlimit = k1 + ((int)classlabels[n]-1)*(k2/Y);
        uplimit  = k1 + ((int)classlabels[n])*(k2/Y)-1;
        //mexPrintf("%d %d %d %d %d %d\n", n, k1, k2, lowlimit, uplimit, (int)classlabels[n]);
    }
    //mexPrintf("\t %d acajcjac %d",n, ndistWords);
    for (i=0; i<ndistWords; i++)
    {
        logsum1 = 0;
        logsum2 = 0;
        tmp1 = Malloc(double,nK);
        //mexPrintf("%d %d %d %d till here ok2\n", n, i, windexn[i], ndistWords);
        
        for (j=0; j<nK; j++)
        {
            tempind = j + ((int)(windexn[i])-1)*nK;
            //mexPrintf("\n%d %d %d %d %f\n",n, i,j, nK, log_beta[tempind]);
            val = 0;
            if(unlclass[n]==1 && (int)classlabels[n]>=1)   // use the dual variable in training phase only when label is present; no dual variable in test phase;
            {
                for (y=0; y<Y; y++)
                    val = val + mu[y*N+n]*(eta[j*Y+ (int)classlabels[n]-1] - eta[j*Y+y]);
                val =  val/nwordspdoc[n];
            }
            
            //mexPrintf("\n%d %d %d %d %f\n",n, i,j, nK, log_beta[tempind]);
            if(option>=3)
                *(tmp1+j) = psigammaptr[j*N+n] + log_beta[tempind] + val;  // access (j,i) th element from gamma
            
            if(option==1 || option==2)
                *(tmp1+j) = psigammaptr[j*N+n] + log_beta[tempind];  // access (j,i) th element from gamma
            
            if(option!=1 && option!=3 && ((option>=4 && j<k1) || (option==2 && j<nK)))
            {
                // supervised topics for options other than 1(LDA) and 3(MedLDA); for option 2(LLDA), all the topics are supervised, so we need an extra clause
                    if(option==2 && *(unlattr+j*N+n)==0); // LLDA
                    else if (option==4 && j<k1 && *(unlattr+j*N+n)==0);  // DSLDA
                    else if (option==5 && ((j<k1 && *(unlattr+j*N+n)==0) || (j>=k1 && !(j>=lowlimit && j<=uplimit)))); // DSLDA-NSLT1
                    else if (option==7 && (j<k1 && *(unlattr+j*N+n)==0)); // DSLDA-OSST
                    else
                        logsum1    = log_sum(*(tmp1+j),logsum1);
            }
            else  // unsupervised topics (training and test phases are identical except for DSLDA-NSLT)
            {
                if(phase==1 && (int)classlabels[n]>=1 && option==5 && !(j>=lowlimit && j<=uplimit)); // empty body: skip in DSLDA-NSLT's training phase
                else
                    logsum2 = log_sum(*(tmp1+j),logsum2); // accumulate; logsum2 is consumed in the conversion loop below
            }
        }
        
        // conversion from log space to real number
        for (j=0; j<nK; j++)
        {
            if(option!=1 && option!=3 && ((option>=4 && j<k1) || (option==2 && j<nK)))
            {
                // supervised topics for options other than 1(LDA) and 3(MedLDA); for option 2(LLDA), all the topics are supervised, so we need an extra clause
                    //if condition says when to ignore phi's for supervised topics in training phase
                    if(option==2 && *(unlattr+j*N+n)==0)
                        temp_phin[j*ndistWords+i] = 0; // LLDA
                    else if (option==4 && j<k1 && *(unlattr+j*N+n)==0)
                        temp_phin[j*ndistWords+i] = 0; // DSLDA
                    else if (option==5 && j<k1 && *(unlattr+j*N+n)==0)
                        temp_phin[j*ndistWords+i] = 0; // DSLDA-NSLT
                    else if (option==7 && (j<k1 && *(unlattr+j*N+n)==0))
                        temp_phin[j*ndistWords+i] = 0; // DSLDA-OSST
                    else if(logsum1 - *(tmp1+j)>100)
                        temp_phin[j*ndistWords+i] = minval;                                //(j,i) th element
                    else if(logsum1 - *(tmp1+j)<100)
                        temp_phin[j*ndistWords+i] = epsilon*exp(*(tmp1+j)-logsum1)+minval; //(j,i) th element
                    else; // do nothing
            }
            else  // unsupervised topics
            {
                if((int)classlabels[n]>=1 && option==5 && !(j>=lowlimit && j<=uplimit)) // DSLDA-NSLT skip in training phase only if the class label is present
                    temp_phin[j*ndistWords+i] = 0;
                else // no distinction between training and test phase except for DSLDA-NSLT
                {
                    if(logsum2 - *(tmp1+j)>100)
                        temp_phin[j*ndistWords+i] = minval;                        //(j,i) th element
                    if(logsum2 - *(tmp1+j)<100)
                        if(option>=4)
                            temp_phin[j*ndistWords+i] = (1-epsilon)*exp(*(tmp1+j)-logsum2)+minval; //(j,i) th element
                        else
                            temp_phin[j*ndistWords+i] = exp(*(tmp1+j)-logsum2)+minval; //(j,i) th element
                }
            }
        }
        free(tmp1);
    }     
    return;
}