static
void
increment_het_ratio_lhood(const starling_options& opt,
                          const starling_deriv_options& dopt,
                          const starling_sample_options& sample_opt,
                          const double indel_error_lnp,
                          const double indel_real_lnp,
                          const double ref_error_lnp,
                          const double ref_real_lnp,
                          const indel_key& ik,
                          const indel_data& id,
                          const double het_ratio,
                          const bool is_tier2_pass,
                          const bool is_use_alt_indel,
                          double* const lhood)
{
    // high and low allele ratio variants:
    double het_lhood_high;
    double het_lhood_low;

    indel_digt_caller::get_high_low_het_ratio_lhood(opt,dopt,sample_opt,
                                                    indel_error_lnp,indel_real_lnp,
                                                    ref_error_lnp,ref_real_lnp,
                                                    ik,id,het_ratio,is_tier2_pass,
                                                    is_use_alt_indel,
                                                    het_lhood_high,het_lhood_low);

    lhood[STAR_DIINDEL::HET] = log_sum(lhood[STAR_DIINDEL::HET],het_lhood_low);
    lhood[STAR_DIINDEL::HET] = log_sum(lhood[STAR_DIINDEL::HET],het_lhood_high);
}
// accelerated version with no hyrax q-val mods:
//
static
void
increment_het_ratio_lhood_spi(const snp_pos_info& pi,
                              const blt_float_t het_ratio,
                              const unsigned het_ratio_index,
                              het_ratio_cache<3>& hrcache,
                              blt_float_t* all_het_lhood)
{
    // multiply probs of alternate ratios into local likelihoods, then
    // *add* them to the global tally (effectively this is the sum lhood of
    // many different heterozygous genotypes).
    //
    // in the gt_high genotype, the first allele (in lexicographical
    // order) is expected at het_ratio and the second allele is
    // expected at chet_ratio. gt_low genotype is vice versa.
    //
    blt_float_t lhood_high[DIGT::SIZE];
    blt_float_t lhood_low[DIGT::SIZE];
    for (unsigned gt(0); gt<DIGT::SIZE; ++gt)
    {
        lhood_high[gt] = 0.;
        lhood_low[gt] = 0.;
    }

    get_high_low_het_ratio_lhood_spi(pi,het_ratio,het_ratio_index,hrcache,lhood_high,lhood_low);

    for (unsigned gt(0); gt<DIGT::SIZE; ++gt)
    {
        if (not DIGT::is_het(gt)) continue;
        all_het_lhood[gt] = log_sum(all_het_lhood[gt],lhood_high[gt]);
        all_het_lhood[gt] = log_sum(all_het_lhood[gt],lhood_low[gt]);
    }
}
static void pmf_iteration(double *f, double *log_r)
{
    double work_sims[sim_count], work_bins[bin_count];
    int i, j;

    for (i = 0; i < bin_count; i++) {
        for (j = 0; j < sim_count; j++) {
            double bias, dx;
            if (period > 0) {
                dx = fabs(hist_x(i) - bias_x[j]);
                while (dx > 0.5 * period)
                    dx -= period;
                bias = 0.5 * bias_k[j] * dx * dx;
            } else {
                dx = hist_x(i) - bias_x[j];
                bias = 0.5 * bias_k[j] * dx * dx;
            }
            work_sims[j] = log_nsim[j] + f[j] - beta * bias;
        }
        log_r[i] = log_nbin[i] - log_sum(work_sims, sim_count);
    }

    for (j = 0; j < sim_count; j++) {
        for (i = 0; i < bin_count; i++) {
            double bias, dx;
            if (period > 0) {
                dx = fabs(hist_x(i) - bias_x[j]);
                while (dx > 0.5 * period)
                    dx -= period;
                bias = 0.5 * bias_k[j] * dx * dx;
            } else {
                dx = hist_x(i) - bias_x[j];
                bias = 0.5 * bias_k[j] * dx * dx;
            }
            work_bins[i] = log_r[i] - beta * bias;
        }
        f[j] = -log_sum(work_bins, bin_count);
    }
}
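// The call log_sum(work_sims, sim_count) above takes a whole array rather
// than a pair of values. As a reference, here is a minimal sketch of the
// max-shifted log-sum-exp such an overload presumably implements; the name
// log_sum_array_sketch and this exact implementation are illustrative
// assumptions, not the project's code.
#include <cmath>

static double log_sum_array_sketch(const double* v, int n)
{
    double vmax = v[0];
    for (int i = 1; i < n; ++i)
        if (v[i] > vmax) vmax = v[i];      // factor out the largest term
    double s = 0.0;
    for (int i = 0; i < n; ++i)
        s += std::exp(v[i] - vmax);        // every exponent is <= 0, so no overflow
    return vmax + std::log(s);             // log(sum_i exp(v_i))
}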
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi)
{
    double converged = 1;
    double phisum = 0, likelihood = 0;
    double likelihood_old = 0, oldphi[model->num_topics];
    int k, n, var_iter;
    double digamma_gam[model->num_topics];

    // compute posterior dirichlet

    for (k = 0; k < model->num_topics; k++)
    {
        var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
        digamma_gam[k] = digamma(var_gamma[k]);
        for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;

    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
        var_iter++;
        for (n = 0; n < doc->length; n++)
        {
            phisum = 0;
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
                phi[n][k] = digamma_gam[k] + model->log_prob_w[k][doc->words[n]];

                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]);
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }

            for (k = 0; k < model->num_topics; k++)
            {
                phi[n][k] = exp(phi[n][k] - phisum);
                var_gamma[k] = var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                // !!! a lot of extra digammas here because of how we're computing it,
                // !!! but it's more automatically updated too.
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }

        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        assert(!isnan(likelihood));
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;

        // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
    }
    return(likelihood);
}
double log_sum(const gsl_vector* x)
{
    double sum = gsl_vector_get(x, 0);
    for (unsigned int ii = 1; ii < x->size; ii++)
    {
        sum = log_sum(sum, gsl_vector_get(x, ii));
    }
    return sum;
}
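// The two-argument log_sum(a, b) used by the reduction above (and by most
// of the snippets here) is assumed to be the standard overflow-safe
// log(exp(a) + exp(b)). A minimal sketch, with an illustrative name:
#include <cmath>

static double log_sum_pair_sketch(double a, double b)
{
    if (b > a) { const double t = a; a = b; b = t; }  // ensure a >= b
    return a + std::log1p(std::exp(b - a));  // log(e^a + e^b) = a + log(1 + e^(b-a))
}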
double log_dot_product(const gsl_vector* log_a, const gsl_vector* log_b)
{
    assert(log_a->size == log_b->size);
    double sum = gsl_vector_get(log_a, 0) + gsl_vector_get(log_b, 0);
    for (unsigned int ii = 1; ii < log_a->size; ++ii)
    {
        sum = log_sum(sum, gsl_vector_get(log_a, ii) + gsl_vector_get(log_b, ii));
    }
    return sum;
}
static
double
integrate_out_sites(const starling_deriv_options& dopt,
                    const uint16_t nsite,
                    const double p_on_site,
                    const bool is_tier2_pass)
{
    return log_sum((p_on_site + dopt.site_lnprior),
                   (dopt.get_nonsite_path_lnp(is_tier2_pass,nsite) + dopt.nonsite_lnprior));
}
/*
 * normalize a vector in log space
 *
 * x_i = log(a_i)
 * v = log(a_1 + ... + a_k)
 * x_i = x_i - v
 */
void log_normalize(gsl_vector* x)
{
    double v = vget(x, 0);
    for (unsigned int i = 1; i < x->size; i++)
    {
        v = log_sum(v, vget(x, i));
    }
    for (unsigned int i = 0; i < x->size; i++)
    {
        vset(x, i, vget(x, i) - v);
    }
}
float log_sum_vec(float* logvec, int D)
{
    float sum = logvec[0];
    for (int i = 1; i < D; i++)
    {
        sum = log_sum(sum, logvec[i]);
    }
    return sum;
}
double softmax_f(const gsl_vector* x, void* opt_param)
{
    opt_parameter* gsl_param = (opt_parameter*)opt_param;
    double PENALTY = gsl_param->PENALTY;
    slda* model = gsl_param->model;
    suffstats* ss = gsl_param->ss;

    double f, t, a1 = 0.0, a2 = 0.0;
    int k, d, j, l, idx;
    double f_regularization = 0.0;

    for (l = 0; l < model->num_classes-1; l++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            model->eta[l][k] = gsl_vector_get(x, l*model->num_topics + k);
            f_regularization -= pow(model->eta[l][k], 2) * PENALTY/2.0;
        }
    }
    f = 0.0; // log likelihood
    for (d = 0; d < ss->num_docs; d++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            if (ss->labels[d] < model->num_classes-1)
            {
                f += model->eta[ss->labels[d]][k] * ss->z_bar[d].z_bar_m[k];
            }
        }
        t = 0.0; // in log space, 1 + exp() + exp() + ...
        for (l = 0; l < model->num_classes-1; l++)
        {
            a1 = 0.0; // \eta_k^T * \bar{\phi}_d
            a2 = 0.0; // 1 + 0.5 * \eta_k^T * Var(z_bar) * \eta_k
            for (k = 0; k < model->num_topics; k++)
            {
                a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k];
                for (j = 0; j < model->num_topics; j++)
                {
                    idx = map_idx(k, j, model->num_topics);
                    a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j];
                }
            }
            a2 = 1.0 + 0.5 * a2;
            t = log_sum(t, a1 + log(a2));
        }
        f -= t;
    }

    return -(f + f_regularization);
}
double var_bayes::inference(const document& doc, std::vector<double>& var_gamma,
                            std::vector<std::vector<double>>& phi)
{
    std::vector<double> digamma_gam(numTopics);
    for (int k=0; k<numTopics; k++) {
        var_gamma[k] = alpha.alpha[k] + doc.count/numTopics;
    }

    int iteration = 0;
    double converged = 1;
    double phisum;
    std::vector<double> prev_gamma = std::vector<double>(numTopics);

    while ((converged > INF_CONV_THRESH) and (iteration < INF_MAX_ITER)) {
        iteration++;

        for (int k=0; k<numTopics; k++) {
            digamma_gam[k] = digamma(var_gamma[k]);
            prev_gamma[k] = var_gamma[k];
            var_gamma[k] = alpha.alpha[k];
        }

        int n=0;
        for (auto const& word_count : doc.wordCounts) {
            phisum = 0;
            for (int k=0; k<numTopics; k++) {
                phi[n][k] = digamma_gam[k] + logProbW[k][word_count.first];
                if (k>0) {
                    phisum = log_sum(phisum, phi[n][k]);
                } else {
                    phisum = phi[n][k];
                }
            }
            // Estimate gamma and phi
            for (int k=0; k<numTopics; k++) {
                phi[n][k] = exp(phi[n][k] - phisum);
                var_gamma[k] += word_count.second*(phi[n][k]);
            }
            n++;
        }

        converged = 0;
        for (int k=0; k<numTopics; ++k) {
            converged += fabs(prev_gamma[k] - var_gamma[k]);
        }
        converged /= numTopics;
    }

    return compute_likelihood(doc, var_gamma, phi);
}
// compute just the non-strand-bias portion of the normal marginal
// prior given p(signal), p(no-strand noise), p(strand-bias noise)
//
static
void
get_nostrand_marginal_prior(const blt_float_t* normal_lnprior,
                            const unsigned ref_gt,
                            const blt_float_t sse_rate,
                            const blt_float_t sseb_fraction,
                            std::vector<blt_float_t>& grid_normal_lnprior)
{
    const blt_float_t strand_sse_rate(sse_rate*sseb_fraction);
    const blt_float_t nostrand_sse_rate(sse_rate-strand_sse_rate);

    const blt_float_t ln_csse_rate( log1p_switch(-sse_rate) );
    //const blt_float_t ln_strand_sse_rate( std::log(strand_sse_rate) );
    const blt_float_t ln_nostrand_sse_rate( std::log(nostrand_sse_rate) );

    // fill in normal sample prior for canonical diploid allele frequencies:
    for (unsigned ngt(0); ngt<DIGT::SIZE; ++ngt)
    {
        grid_normal_lnprior[ngt] = (normal_lnprior[ngt]+ln_csse_rate);
    }

    // nostrand noise prior distributions for each allele combination axis:
    //
    // weight the prior by the potential originating genotypes:
    // if on AB axis, we want P(AA+noiseB)+P(AB+noise)+P(BB+noiseA)
    // so we have P(AA)*error_prob/3 + P(AB)*error_prob + P(BB)*error_prob/3
    //
    static const unsigned n_het_axes(6);
    blt_float_t nostrand_axis_prior[n_het_axes];
    for (unsigned ngt(N_BASE); ngt<DIGT::SIZE; ++ngt)
    {
        const unsigned axis_id(ngt-N_BASE);
        nostrand_axis_prior[axis_id] = normal_lnprior[ngt];

        // get the two associated homs:
        for (unsigned b(0); b<N_BASE; ++b)
        {
            if (DIGT::expect2(b,ngt)<=0) continue;
            nostrand_axis_prior[axis_id] = log_sum(nostrand_axis_prior[axis_id],
                                                   normal_lnprior[b]+ln_one_third);
        }
    }

    static const blt_float_t error_mod( -std::log(static_cast<blt_float_t>(DIGT_SGRID::HET_RES*2)) );

    // fill in normal sample prior for 'noise' frequencies:
    for (unsigned ngt(DIGT::SIZE); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt)
    {
        // 'ngt2' is the root diploid state corresponding to noise state 'ngt'
        const unsigned ngt2(DIGT_SGRID::get_digt_state(ngt,ref_gt));
        assert(ngt2>=N_BASE);
        const unsigned axis_id(ngt2-N_BASE);
        grid_normal_lnprior[ngt] = (nostrand_axis_prior[axis_id]+ln_nostrand_sse_rate+error_mod);
        //grid_normal_lnprior[ngt] = (normal_lnprior[ngt2]+ln_sse_rate+error_mod);
    }
}
// Given a log vector, log a_i, compute log sum a_i and normalize the
// vector in place. Returns the log sum.
double log_normalize(gsl_vector* x)
{
    double sum = gsl_vector_get(x, 0);
    unsigned int i;
    for (i = 1; i < x->size; i++)
    {
        sum = log_sum(sum, gsl_vector_get(x, i));
    }
    for (i = 0; i < x->size; i++)
    {
        double val = gsl_vector_get(x, i);
        gsl_vector_set(x, i, val - sum);
    }
    return sum;
}
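// Hypothetical usage of log_normalize above (values illustrative, assumes
// GSL is linked): normalize the log-weights of {1, 2, 3} in place; the
// vector then holds log(1/6), log(2/6), log(3/6) and the return value is
// log(6).
#include <gsl/gsl_vector.h>
#include <cmath>
#include <cstdio>

int main()
{
    gsl_vector* x = gsl_vector_alloc(3);
    for (size_t i = 0; i < 3; ++i)
        gsl_vector_set(x, i, std::log(static_cast<double>(i + 1)));

    const double lsum = log_normalize(x);
    std::printf("log-sum: %f\n", lsum);            // ~log(6) = 1.7918
    for (size_t i = 0; i < x->size; ++i)
        std::printf("%zu: %f\n", i, std::exp(gsl_vector_get(x, i)));

    gsl_vector_free(x);
    return 0;
}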
/*
 * return the log of the Stirling number, log(s(n,m)):
 *   s(n, n) = 1
 *   s(n, 0) = 0 if n > 0
 *   s(n, m) = 0 if n < m
 *   s(n+1, m) = s(n, m-1) + n*s(n, m)
 */
double Stirling::get_log_stirling_num(size_t n, size_t m)
{
    if (n < m) return log_zero;

    size_t start = log_stirling_num_.size();
    for (size_t i = start; i < n+1; ++i)
    {
        double* v = new double[i+1];
        for (size_t j = 0; j < i + 1; ++j)
        {
            v[j] = log_zero;
        }
        log_stirling_num_.push_back(v);
        log_stirling_num_[i][i] = 0.0;
        for (size_t j = 1; j < i; ++j)
        {
            log_stirling_num_[i][j] = log_sum(log_stirling_num_[i-1][j-1],
                                              log(i-1) + log_stirling_num_[i-1][j]);
        }
    }
    return log_stirling_num_[n][m];
}
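// A quick sanity check of the recurrence above (hypothetical usage,
// assuming a default-constructed Stirling starts with an empty cache):
// the unsigned Stirling numbers of the first kind give s(4,2) = 11,
// so the call should return approximately log(11) ~= 2.3979.
#include <cassert>
#include <cmath>

void check_stirling()
{
    Stirling st;
    const double v = st.get_log_stirling_num(4, 2);
    assert(std::fabs(v - std::log(11.0)) < 1e-9);
}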
/*
 * returns the index of an element randomly sampled in proportion to the
 * log probabilities in vals (length is the number of elements)
 */
int log_sample(double* vals, int length)
{
    double normalizer = safe_log(0.0);
    int ii;
    for (ii = 0; ii < length; ++ii)
    {
        normalizer = log_sum(normalizer, vals[ii]);
    }
    double val = 0, sum = 0, cutoff = (double)rand() / ((double)RAND_MAX + 1.0);
    for (ii = 0; ii < length; ++ii)
    {
        val = exp(vals[ii] - normalizer);
        sum += val;
        if (sum >= cutoff) break;
    }
    assert(ii < length);
    return ii;
}
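// Hypothetical usage of log_sample above: draw an index in proportion to
// unnormalized log-weights. With weights {1, 2, 4}, index 2 is returned
// roughly 4/7 of the time. Assumes log_sample (and its safe_log/log_sum
// helpers) are linked; the seed is illustrative.
#include <cmath>
#include <cstdio>
#include <cstdlib>

int main()
{
    double logw[3] = { std::log(1.0), std::log(2.0), std::log(4.0) };
    std::srand(42);
    std::printf("sampled index: %d\n", log_sample(logw, 3));
    return 0;
}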
int log_vector_sample(std::vector<double> vals, int length)
{
    double normalizer = safe_log(0.0);
    int ii = 0;
    assert(length > 0 && length <= (int)vals.size());
    for (ii = 0; ii < length; ++ii)
    {
        normalizer = log_sum(normalizer, vals[ii]);
    }
    double val = 0, sum = 0, cutoff = (double)rand() / ((double)RAND_MAX + 1.0);
    for (ii = 0; ii < length; ++ii)
    {
        val = exp(vals[ii] - normalizer);
        sum += val;
        if (sum >= cutoff) break;
    }
    assert(ii < length);
    return ii;
}
void infer_phi(sctm_data* data, sctm_params* params, sctm_latent* latent, sctm_counts* counts)
{
    int k, v;
    double p1, p2, lognorm, r;

    for (k = 0; k < params->K; k++) {
        for (v = 0; v < data->V; v++) {
            counts->phiEta_v[k] -= latent->phi[k][v] * params->eta;
            counts->phi_k[v] -= latent->phi[k][v];

            if (counts->n_dij[k][v] > 0)
                latent->phi[k][v] = 1;
            else {
                p1 = lgamma(counts->phiEta_v[k] + params->eta)
                   - lgamma(counts->phiEta_v[k] + params->eta + counts->n_dijv[k]);
                p1 += log(params->nu_lambda / (double) data->V + counts->phi_k[v])
                    - log(params->nu_lambda / (double) data->V + params->lambda + params->K);

                if (counts->phiEta_v[k] > 0) {
                    p2 = lgamma(counts->phiEta_v[k])
                       - lgamma(counts->phiEta_v[k] + counts->n_dijv[k]);
                    p2 += log(params->lambda + params->K - counts->phi_k[v] - 1)
                        - log(params->nu_lambda / (double) data->V + params->lambda + params->K);
                    lognorm = log_sum(p1, p2);
                } else
                    lognorm = p1;

                r = myrand();
                if (log(r) + lognorm <= p1)
                    latent->phi[k][v] = 1;
                else
                    latent->phi[k][v] = 0;
            }

            counts->phiEta_v[k] += latent->phi[k][v] * params->eta;
            counts->phi_k[v] += latent->phi[k][v];
        }
    }
}
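// The accept step above draws a Bernoulli variable without leaving log
// space: normalize the two log-probabilities with log_sum, then compare
// log(r) against the first one. A minimal standalone sketch of that
// pattern (the helper name and the use of std::rand() are illustrative
// assumptions, not sctm's code):
#include <cmath>
#include <cstdlib>

static int log_bernoulli_sketch(double lp1, double lp2)
{
    const double lognorm = log_sum(lp1, lp2);  // log(exp(lp1) + exp(lp2))
    const double r = (double)std::rand() / ((double)RAND_MAX + 1.0);
    return (std::log(r) + lognorm <= lp1) ? 1 : 0;  // 1 w.p. exp(lp1 - lognorm)
}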
// Given a log matrix, log a_ij, compute log sum a_ij and normalize the
// matrix in place. Returns the log sum.
double log_normalize_matrix(gsl_matrix* x)
{
    double sum = gsl_matrix_get(x, 0, 0);
    for (size_t ii = 0; ii < x->size1; ++ii)
    {
        for (size_t jj = 0; jj < x->size2; ++jj)
        {
            if (ii == 0 && jj == 0)
            {
                continue;
            }
            sum = log_sum(sum, gsl_matrix_get(x, ii, jj));
        }
    }

    for (size_t ii = 0; ii < x->size1; ++ii)
    {
        for (size_t jj = 0; jj < x->size2; ++jj)
        {
            double val = gsl_matrix_get(x, ii, jj);
            gsl_matrix_set(x, ii, jj, val - sum);
        }
    }
    return sum;
}
void
indel_digt_caller::
get_indel_digt_lhood(const starling_options& opt,
                     const starling_deriv_options& dopt,
                     const starling_sample_options& sample_opt,
                     const double indel_error_prob,
                     const double ref_error_prob,
                     const indel_key& ik,
                     const indel_data& id,
                     const bool is_het_bias,
                     const double het_bias,
                     const bool is_tier2_pass,
                     const bool is_use_alt_indel,
                     double* const lhood)
{
    static const double loghalf(-std::log(2.));

    for (unsigned gt(0); gt<STAR_DIINDEL::SIZE; ++gt) lhood[gt] = 0.;

    const bool is_breakpoint(ik.is_breakpoint());

    const double indel_error_lnp(std::log(indel_error_prob));
    const double indel_real_lnp(std::log(1.-indel_error_prob));
    const double ref_error_lnp(std::log(ref_error_prob));
    const double ref_real_lnp(std::log(1.-ref_error_prob));

    //typedef read_path_scores::alt_indel_t::const_iterator aiter;
    typedef indel_data::score_t::const_iterator siter;
    siter it(id.read_path_lnp.begin()), it_end(id.read_path_lnp.end());
    for (; it!=it_end; ++it)
    {
        const read_path_scores& path_lnp(it->second);

        // optionally skip tier2 data:
        if ((! is_tier2_pass) && (! path_lnp.is_tier1_read)) continue;

        // get alt path lnp:
        double alt_path_lnp(path_lnp.ref);
#if 0
        if (is_use_alt_indel && path_lnp.is_alt &&
            (path_lnp.alt > alt_path_lnp))
        {
            alt_path_lnp=path_lnp.alt;
        }
#else
        if (is_use_alt_indel and (not path_lnp.alt_indel.empty()))
        {
            typedef read_path_scores::alt_indel_t::const_iterator aiter;
            aiter j(path_lnp.alt_indel.begin()), j_end(path_lnp.alt_indel.end());
            for (; j!=j_end; ++j)
            {
                if (j->second>alt_path_lnp) alt_path_lnp=j->second;
            }
        }
#endif

        const double noindel_lnp(log_sum(alt_path_lnp+ref_real_lnp,path_lnp.indel+indel_error_lnp));
        const double hom_lnp(log_sum(alt_path_lnp+ref_error_lnp,path_lnp.indel+indel_real_lnp));

        // allele ratio convention is that the indel occurs at the
        // het_allele ratio and the alternate allele occurs at
        // (1-het_allele_ratio):
        double log_ref_prob(loghalf);
        double log_indel_prob(loghalf);
        if (not is_breakpoint)
        {
            static const double het_allele_ratio(0.5);
            get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank,
                                          ik,het_allele_ratio,log_ref_prob,log_indel_prob);
        }
        const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob));

        lhood[STAR_DIINDEL::NOINDEL] += integrate_out_sites(dopt,path_lnp.nsite,noindel_lnp,is_tier2_pass);
        lhood[STAR_DIINDEL::HOM]     += integrate_out_sites(dopt,path_lnp.nsite,hom_lnp,is_tier2_pass);
        lhood[STAR_DIINDEL::HET]     += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass);

#ifdef DEBUG_INDEL_CALL
        //log_os << std::setprecision(8);
        //log_os << "INDEL_CALL i,ref_lnp,indel_lnp,lhood(noindel),lhood(hom),lhood(het): " << i << " " << path_lnp.ref << " " << path_lnp.indel << " " << lhood[STAR_DIINDEL::NOINDEL] << " " << lhood[STAR_DIINDEL::HOM] << " " << lhood[STAR_DIINDEL::HET] << "\n";
#endif
    }

    if (is_het_bias)
    {
        // loop is currently setup to assume a uniform het ratio subgenotype prior
        const unsigned n_bias_steps(1+static_cast<unsigned>(het_bias/opt.het_bias_max_ratio_inc));
        const double ratio_increment(het_bias/static_cast<double>(n_bias_steps));
        for (unsigned step(0); step<n_bias_steps; ++step)
        {
            const double het_ratio(0.5+(step+1)*ratio_increment);
            increment_het_ratio_lhood(opt,dopt,sample_opt,
                                      indel_error_lnp,indel_real_lnp,
                                      ref_error_lnp,ref_real_lnp,
                                      ik,id,het_ratio,is_tier2_pass,is_use_alt_indel,lhood);
        }

        const unsigned n_het_subgt(1+2*n_bias_steps);
        const double subgt_log_prior(std::log(static_cast<double>(n_het_subgt)));
        lhood[STAR_DIINDEL::HET] -= subgt_log_prior;
    }
}
double doc_e_step(const t_document* doc, const double* dirichlet_prior,
                  double* nu, double** digamma_lambda, double* digamma_lambda_sum,
                  const t_setting* setting, const int doc_id,
                  double** rho, double* old_rho)
{
    const int& numtopics = setting->num_topics;
    const int& doclength = doc->length;
    const double doctotaloverk = (doc->total) / (double)(numtopics);

    // Initialize rho, nu
    for (int i = 0; i < numtopics; ++i) {
        for (int l = 0; l < doclength; ++l) {
            rho[l][i] = oneoverk;
        }
        nu[i] = dirichlet_prior[i] + doctotaloverk;
        digamma_nu[doc_id][i] = digamma(nu[i]);
    }

    int doc_loop = 0;
    double doc_likelihood = 0;
    double doc_likelihood_old = 0;
    double doc_converged = 1;
    double nu_sum = 0;
    double indep_part_likelihood = 0;
    double dep_part_likelihood = 0;

    while ((doc_loop < 2) ||
           ((doc_converged > setting->doc_converged) && (doc_loop < setting->doc_max_iter))) {
        doc_loop += 1;

        for (int l = 0; l < doclength; ++l) {
            double rhosum = 0;
            for (int i = 0; i < numtopics; ++i) {
                old_rho[i] = rho[l][i];
                rho[l][i] = digamma_nu[doc_id][i]
                          + digamma_lambda[i][doc->words[l]]
                          - digamma_lambda_sum[i];
                assert(rho[l][i] != 0);
                assert(!std::isnan(rho[l][i]));
                if (i > 0) {
                    rhosum = log_sum(rhosum, rho[l][i]);
                } else {
                    rhosum = rho[l][i];
                }
                assert(!std::isnan(rhosum));
            }
            for (int i = 0; i < numtopics; ++i) {
                rho[l][i] = exp(rho[l][i] - rhosum);
                nu[i] = nu[i] + (doc->counts[l]) * (rho[l][i] - old_rho[i]);
                // !!! a lot of extra digammas here because of how we're computing it,
                // !!! but it's more automatically updated too.
                digamma_nu[doc_id][i] = digamma(nu[i]);
                assert(!std::isnan(digamma_nu[doc_id][i]));
            }
        }

        nu_sum = 0;
        for (int i = 0; i < numtopics; ++i) {
            nu_sum += nu[i];
        }
        digamma_nu_sum[doc_id] = digamma(nu_sum);

        indep_part_likelihood = -lgamma(nu_sum);
        dep_part_likelihood = 0;
        for (int i = 0; i < numtopics; ++i) {
            double delta = (digamma_nu[doc_id][i] - digamma_nu_sum[doc_id]);
            indep_part_likelihood += lgamma(nu[i]) - delta * nu[i];
            dep_part_likelihood += delta * dirichlet_prior[i];
            for (int l = 0; l < doclength; ++l) {
                if (rho[l][i] > 0) {
                    indep_part_likelihood += rho[l][i] * (doc->counts[l]) *
                        (delta + digamma_lambda[i][doc->words[l]] - digamma_lambda_sum[i] - log(rho[l][i]));
                }
            }
        }
        assert(!std::isnan(indep_part_likelihood));
        assert(!std::isnan(dep_part_likelihood));

        doc_likelihood = indep_part_likelihood + dep_part_likelihood;
        doc_converged = (doc_likelihood_old - doc_likelihood) / doc_likelihood_old;

        //if (0 != doc_likelihood_old && doc_likelihood < doc_likelihood_old) {
        //    printf("Warning: doc_likelihood is decreasing. doc_id: %d \t step: %d \t old: %.8f \t new: %.8f \t ratio: %.8f\n",
        //           doc_id, doc_loop, doc_likelihood_old, doc_likelihood, doc_converged);
        //}
        assert((doc_loop == 1) || (doc_likelihood >= doc_likelihood_old) ||
               (((doc_likelihood_old - doc_likelihood) / fabs(doc_likelihood_old) < DOC_DECREASE_ALLOWANCE) && (doc_loop >= 4)));
        doc_likelihood_old = doc_likelihood;
    }

    if (doc_loop >= setting->doc_max_iter) {
        printf("doc loop max reached %d\n", doc_id);
        exit(-1);
    }

    return indep_part_likelihood;
}
// calculate probability of strand-specific noise
//
// accelerated version with no hyrax q-val mods:
//
// the ratio key can be used as a proxy for the het ratio to look up cached results:
//
static
void
get_strand_ratio_lhood_spi(const snp_pos_info& pi,
                           const unsigned ref_gt,
                           const blt_float_t het_ratio,
                           const unsigned het_ratio_index,
                           het_ratio_cache<2>& hrcache,
                           blt_float_t* lhood)
{
    // het_ratio is the expected allele frequency of noise on the
    // noise-strand, or "on-strand" below. All possible ratio values
    // (there should be very few) have an associated index value for
    // caching.
    //
    const blt_float_t chet_ratio(1.-het_ratio);

    const unsigned n_calls(pi.calls.size());

    // In this situation every basecall falls into 1 of 4 states:
    //
    // 0: off-strand non-reference allele (0)
    // 1: on-strand non-reference allele (het_ratio)  (cached)
    // 2: on-strand agrees with the reference (chet_ratio) (cached)
    // 3: off-strand agrees with the reference (1)
    //
    // The off-strand states are not cached below because they're
    // simpler to compute
    //
    blt_float_t lhood_fwd[DIGT_SGRID::STRAND_SIZE]; // "on-strand" is fwd
    blt_float_t lhood_rev[DIGT_SGRID::STRAND_SIZE]; // "on-strand" is rev

    for (unsigned i(0); i<DIGT_SGRID::STRAND_SIZE; ++i)
    {
        lhood_fwd[i] = 0;
        lhood_rev[i] = 0;
    }

    static const unsigned n_strand_het_axes(3);

    for (unsigned i(0); i<n_calls; ++i)
    {
        const base_call& bc(pi.calls[i]);

        std::pair<bool,cache_val<2>*> ret(hrcache.get_val(bc.get_qscore(),het_ratio_index));
        cache_val<2>& cv(*ret.second);
        if (! ret.first)
        {
            const blt_float_t eprob(bc.error_prob());
            const blt_float_t ceprob(1.-eprob);
            // cached value [0] refers to state 2 above: on-strand
            // reference allele
            cv.val[0]=(std::log((ceprob)*chet_ratio+((eprob)*one_third)*het_ratio));
            // cached value [1] refers to state 1 above: on-strand
            // non-reference allele
            cv.val[1]=(std::log((ceprob)*het_ratio+((eprob)*one_third)*chet_ratio));
        }

        const uint8_t obs_id(bc.base_id);
        if (obs_id==ref_gt)
        {
            //const double val_onstr(std::log((ceprob)*chet_ratio+((eprob)*one_third)*het_ratio));
            const blt_float_t val_off_strand(bc.ln_comp_error_prob());
            const blt_float_t val_fwd(bc.is_fwd_strand ? cv.val[0] : val_off_strand);
            const blt_float_t val_rev(bc.is_fwd_strand ? val_off_strand : cv.val[0]);

            for (unsigned sgt(0); sgt<n_strand_het_axes; ++sgt)
            {
                lhood_fwd[sgt] += val_fwd;
                lhood_rev[sgt] += val_rev;
            }
        }
        else
        {
            //const double val_onstr(std::log((ceprob)*het_ratio+((eprob)*one_third)*chet_ratio));
            const blt_float_t val_off_strand(bc.ln_error_prob()+ln_one_third);
            const blt_float_t val_fwd(bc.is_fwd_strand ? cv.val[1] : val_off_strand);
            const blt_float_t val_rev(bc.is_fwd_strand ? val_off_strand : cv.val[1]);

            const unsigned match_strand_state(obs_id>ref_gt ? obs_id-1 : obs_id);
            for (unsigned sgt(0); sgt<n_strand_het_axes; ++sgt)
            {
                if (sgt==match_strand_state)
                {
                    lhood_fwd[sgt] += val_fwd;
                    lhood_rev[sgt] += val_rev;
                }
                else
                {
                    lhood_fwd[sgt] += val_off_strand;
                    lhood_rev[sgt] += val_off_strand;
                }
            }
        }
    }

    for (unsigned i(0); i<DIGT_SGRID::STRAND_SIZE; ++i)
    {
        lhood[i] = log_sum(lhood_fwd[i],lhood_rev[i])+ln_one_half;
    }
}
void softmax_df(const gsl_vector* x, void* opt_param, gsl_vector* df)
{
    opt_parameter* gsl_param = (opt_parameter*)opt_param;
    double PENALTY = gsl_param->PENALTY;
    slda* model = gsl_param->model;
    suffstats* ss = gsl_param->ss;

    gsl_vector_set_zero(df);
    gsl_vector* df_tmp = gsl_vector_alloc(df->size);

    double t, a1 = 0.0, a2 = 0.0, g;
    int k, d, j, l, idx;

    double* eta_aux = new double[model->num_topics];

    for (l = 0; l < model->num_classes-1; l++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            idx = l*model->num_topics + k;
            model->eta[l][k] = gsl_vector_get(x, idx);
            g = -PENALTY * model->eta[l][k];
            gsl_vector_set(df, idx, g);
        }
    }

    for (d = 0; d < ss->num_docs; d++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            l = ss->labels[d];
            if (l < model->num_classes-1)
            {
                idx = l*model->num_topics + k;
                g = gsl_vector_get(df, idx) + ss->z_bar[d].z_bar_m[k];
                gsl_vector_set(df, idx, g);
            }
        }
        t = 0.0; // in log space, 1 + exp() + exp() + ...
        gsl_vector_memcpy(df_tmp, df);
        gsl_vector_set_zero(df);
        for (l = 0; l < model->num_classes-1; l++)
        {
            memset(eta_aux, 0, sizeof(double)*model->num_topics);
            a1 = 0.0; // \eta_k^T * \bar{\phi}_d
            a2 = 0.0; // 1 + 0.5*\eta_k^T * Var(z_bar) * \eta_k
            for (k = 0; k < model->num_topics; k++)
            {
                a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k];
                for (j = 0; j < model->num_topics; j++)
                {
                    idx = map_idx(k, j, model->num_topics);
                    a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j];
                    eta_aux[k] += ss->z_bar[d].z_bar_var[idx] * model->eta[l][j];
                }
            }
            a2 = 1.0 + 0.5 * a2;
            t = log_sum(t, a1 + log(a2));

            for (k = 0; k < model->num_topics; k++)
            {
                idx = l*model->num_topics + k;
                g = gsl_vector_get(df, idx) -
                    exp(a1) * (ss->z_bar[d].z_bar_m[k] * a2 + eta_aux[k]);
                gsl_vector_set(df, idx, g);
            }
        }
        gsl_vector_scale(df, exp(-t));
        gsl_vector_add(df, df_tmp);
    }

    gsl_vector_scale(df, -1.0);
    delete [] eta_aux;
    gsl_vector_free(df_tmp);
}
double doc_inference(vblda_corpus* corpus, vblda_model* model, vblda_ss* ss, int d, int test)
{
    int n, j, variter, w;
    double c, phisum, temp, cphi;
    double varlkh, prev_varlkh, conv;

    prev_varlkh = -1e100;
    conv = 0.0;
    variter = 0;

    do {
        varlkh = 0.0;
        for (n = 0; n < corpus->docs[d].length; n++) {
            w = corpus->docs[d].words[n];
            c = (double) corpus->docs[d].counts[n];
            phisum = 0.0;
            for (j = 0; j < model->m; j++) {
                ss->oldphi[j] = corpus->docs[d].phi[n][j];
                corpus->docs[d].phi[n][j] = gsl_sf_psi(model->gamma[j][d]) + model->psimu[j][w];
                if (j > 0)
                    phisum = log_sum(phisum, corpus->docs[d].phi[n][j]);
                else
                    phisum = corpus->docs[d].phi[n][j];
            }
            for (j = 0; j < model->m; j++) {
                corpus->docs[d].phi[n][j] = exp(corpus->docs[d].phi[n][j] - phisum);
                temp = c*(corpus->docs[d].phi[n][j] - ss->oldphi[j]);
                model->gamma[j][d] += temp;
                ss->sumgamma[d] += temp;
                if (corpus->docs[d].phi[n][j] > 0) {
                    cphi = c*corpus->docs[d].phi[n][j];
                    varlkh += cphi*(model->psimu[j][w] - log(corpus->docs[d].phi[n][j]));
                }
            }
        }
        varlkh -= lgamma(ss->sumgamma[d]);
        for (j = 0; j < model->m; j++) {
            varlkh += lgamma(model->gamma[j][d]);
        }
        conv = fabs(prev_varlkh - varlkh)/fabs(prev_varlkh);
        if (prev_varlkh > varlkh) {
            printf("ooops doc %d, %lf %lf, %5.10e\n", d, varlkh, prev_varlkh, conv);
        }
        prev_varlkh = varlkh;
        variter++;
    } while ((variter < MAXITER) && (conv > CONVERGED));

    if (test == 0) {
        for (n = 0; n < corpus->docs[d].length; n++) {
            w = corpus->docs[d].words[n];
            c = (double) corpus->docs[d].counts[n];
            for (j = 0; j < model->m; j++) {
                cphi = corpus->docs[d].phi[n][j];
                varlkh -= cphi*model->psimu[j][w];
                ss->t[j][w] += cphi;
            }
        }
    }
    return(varlkh);
}
void update_genotype_interval(vector<float>& genotype_interval,
                              vector<float>& interval_cuts,
                              PosteriorInference& local_posterior)
{
    for (unsigned int i_cut = 0; i_cut < genotype_interval.size(); i_cut++)
    {
        genotype_interval[i_cut] =
            log_sum(genotype_interval[i_cut],
                    local_posterior.gq_pair.LogDefiniteIntegral(interval_cuts[i_cut+1],
                                                                interval_cuts[i_cut])
                    + local_posterior.params_ll);
    }
}
void update_zeta_i(mxArray *retptr, double *sstopicwordptr, double *ssfeatures,
                   double *zetai, double *windexi, double *wcounti,
                   const mxArray *model, const mxArray *data,
                   const double *Esticks, const double *Eloglambda,
                   const double *Eloggamma, const double *smallphi,
                   const int ndistWords, const int i, double *count2, double *option)
{
    int index, nK1, nK2, t, k, k1, T, K1, K2, V, N, tempind1, tempind2, j, y, C2, Y, phase;
    double logsum, logsum1, logsum2, minval, val1, val2, val3, value, epsilon, valtemp;
    double *tmpptr, *tmp1, *log_beta, *dmu, *r, *annotations, *classlabels, *nwordspdoc;
    mxArray *tmp;

    minval = mxGetScalar(mxGetField(model,0,"MINVALUE"));
    phase = mxGetScalar(mxGetField(model,0,"phase"));
    classlabels = (double*)mxGetPr(mxGetField(data,0,"classlabels"));
    nwordspdoc = (double*)mxGetPr(mxGetField(data,0,"nwordspdoc"));
    annotations = (double*)mxGetPr(mxGetField(data,0,"annotations"));
    r = mxGetPr(mxGetField(model,0,"r"));
    V = mxGetScalar(mxGetField(model,0,"V"));
    N = mxGetScalar(mxGetField(model,0,"N"));
    C2 = mxGetScalar(mxGetField(model,0,"C2"));
    Y = mxGetScalar(mxGetField(data,0,"Y"));
    T = mxGetScalar(mxGetField(model,0,"T"));
    K1 = mxGetScalar(mxGetField(model,0,"K1"));
    K2 = mxGetScalar(mxGetField(model,0,"K2"));
    epsilon = mxGetScalar(mxGetField(model,0,"epsilon"));

    if ((int)option[0]==1) {
        nK1 = (T+K2);
        nK2 = (K1+K2);
    } else {
        nK1 = T;
        nK2 = K1;
    }

    if (phase==1) {
        // use the dual variable only in training phase; no dual variable in test phase
        dmu = (double*)mxGetPr(mxGetField(model,0,"dmu"));
    }

    tmp = mxCreateDoubleMatrix(ndistWords,nK1,mxREAL);
    tmpptr = mxGetPr(tmp);

    for (j=0; j<ndistWords; j++) // loop over (distinct) words
    {
        logsum = 0;
        logsum1 = 0;
        logsum2 = 0;
        tmp1 = Malloc(double,nK1);
        for (k=0; k<nK1; k++) // loop over third dimension
        {
            tempind1 = k + ((int)(windexi[j])-1)*nK2;
            val1 = 0;

            // terms from document level supervision: use the dual variable in
            // training phase only when the label is present; no dual variable
            // in test phase
            if (phase==1 && (int)classlabels[i]>=1)
            {
                if (k<T) // for unsupervised topics
                {
                    for (y=0; y<Y; y++)
                    {
                        valtemp = 0;
                        for (k1=0; k1<K1; k1++)
                        {
                            valtemp = valtemp + (r[k1*Y+ (int)classlabels[i]-1] - r[k1*Y+y])*smallphi[i+k*N+k1*N*T];
                        }
                        val1 = val1 + dmu[i+y*N]*valtemp;
                    }
                }
                else if ((int)option[0]==1) // for NPDSLDA, supervised topics
                {
                    for (y=0; y<Y; y++)
                    {
                        val1 = val1 + dmu[i+y*N]*(r[(int)classlabels[i]-1 + (k-T+K1)*Y] - r[y + (k-T+K1)*Y]);
                    }
                    val1 = val1/nwordspdoc[i];
                }
            }

            // for other terms
            if (k<T) // for unsupervised topics
            {
                val2 = 0;
                for (k1=0; k1<K1; k1++)
                {
                    tempind2 = k1 + ((int)(windexi[j])-1)*nK2;
                    val2 = val2 + smallphi[i+k*N+k1*N*T]*Eloglambda[tempind2];
                }
                val3 = Esticks[i+k*N];
                *(tmp1+k) = val1 + val2 + val3;
            }
            else if ((int)option[0]==1) // for NPDSLDA, supervised topics
            {
                *(tmp1+k) = Eloggamma[i+k*N] + Eloglambda[tempind1] + val1;
            }

            if (phase==1) // only in training phase; no need for any clause in test phase
            {
                if ((int)option[0]==1) // NPDSLDA
                {
                    if (k<T) // unsupervised topics
                        logsum1 = log_sum(*(tmp1+k),logsum1);
                    else // supervised topics
                    {
                        if (*(annotations+(k-T)*N+i)==0)
                            ; // if condition says when to ignore phi's
                        else
                            logsum2 = log_sum(*(tmp1+k),logsum2);
                    }
                }
                else // NPLDA
                    logsum = log_sum(*(tmp1+k),logsum);
            }
        }

        // conversion from log space to real number
        for (k=0; k<nK1; k++)
        {
            if ((int)option[0]==1) // NPDSLDA
            {
                if (k<T) // unsupervised topics
                {
                    if (logsum1 - *(tmp1+k)>10000)
                        tmpptr[k*ndistWords+j] = minval;
                    if (logsum1 - *(tmp1+k)<10000)
                        tmpptr[k*ndistWords+j] = (1-epsilon)*exp(*(tmp1+k) - logsum1) + minval;
                }
                else // supervised topics
                {
                    if (logsum2 - *(tmp1+k)>10000)
                        tmpptr[k*ndistWords+j] = minval;
                    if (logsum2 - *(tmp1+k)<10000)
                        tmpptr[k*ndistWords+j] = epsilon*exp(*(tmp1+k) - logsum2) + minval;
                    if (phase==1 && k>=T && *(annotations+(k-T)*N+i)==0) // only in training phase
                    {
                        tmpptr[k*ndistWords+j] = 0; // DSLDA
                        //mexPrintf("%d %d %d hey here!\n", i, j, k);
                    }
                }
            }
            else // NPLDA
            {
                if (logsum - *(tmp1+k)>10000)
                    tmpptr[k*ndistWords+j] = minval;
                if (logsum - *(tmp1+k)<10000)
                    tmpptr[k*ndistWords+j] = exp(*(tmp1+k) - logsum) + minval;
            }
        }
        free(tmp1);
    }

    // update sufficient statistics -- for both unsupervised and supervised topics
    for (k = 0; k < nK2; k++)
    {
        for (j = 0; j < ndistWords; j++)
        {
            value = 0;
            if (k<K1) // unsupervised topics
            {
                for (t = 0; t < T; t++) // loop over topics
                {
                    //mexPrintf("hey here1! %d %d %d %d %f\n", k, i, ndistWords, t, windexi[j]);
                    value += smallphi[i+t*N+k*N*T]*tmpptr[j+t*ndistWords]; // smallphi{ntk1}*zeta{nmt}
                }
                value = wcounti[j]*value;
            }
            else if ((int)option[0]==1) // for NPDSLDA, supervised topics
            {
                value = wcounti[j]*tmpptr[j+(k-K1+T)*ndistWords]; // zeta{nmk2}
            }
            index = k + ((int)windexi[j]-1)*nK2;
            sstopicwordptr[index] += value;
            ssfeatures[i+k*N] += value;
        }
    }

    mxSetCell(retptr, i, tmp);
    return;
}
void infer_xi(sctm_data* data, sctm_params* params, sctm_latent* latent, sctm_counts* counts)
{
    int d, i, a, k, n;
    double p1, p2, p11, p22, norm;

    for (d = 0; d < data->D; d++) {
        documents* doc = &(data->docs[d]);
        for (i = 0; i < doc->C; i++) {
            comment* cmnt = &(doc->cmnts[i]);
            for (a = 0; a < doc->S; a++) {
                sentence* sent = &(doc->sents[a]);

                p1 = 0.;
                p2 = 0.;
                for (n = 0; n < cmnt->N; n++) {
                    k = latent->y[d][i][n];
                    //if (latent->t[d][i][n] == 0)
                    //    continue;
                    if (latent->xi[d][i][a] == 1) {
                        p11 = log(counts->m[d][i][k] * 1.0) - log(counts->m_k[d][i]*1.0);
                        // if only article sentence (among selected) in which k topic occurs:
                        if (counts->m_k[d][i] - sent->N == 0)
                            p22 = -INFINITY;
                        else
                            p22 = log((counts->m[d][i][k] - counts->n_jv[d][a][k]) * 1.0)
                                - log((counts->m_k[d][i] - sent->N)*1.0);
                    } else {
                        p11 = log((counts->m[d][i][k] + counts->n_jv[d][a][k]) * 1.0)
                            - log((counts->m_k[d][i] + sent->N)*1.0);
                        // if only article sentence (among selected) in which y topic occurs:
                        if (counts->m_k[d][i] == 0)
                            p22 = -INFINITY;
                        else
                            p22 = log(counts->m[d][i][k] * 1.0) - log(counts->m_k[d][i]*1.0);
                    }
                    p1 += p11;
                    p2 += p22;
                    if (isinf(p2) && !isinf(p1))
                        break;
                }
                p1 += log(params->vr1 * 1.0) - log((params->vr1 + params->vr2)*1.0);
                p2 += log(params->vr2 * 1.0) - log((params->vr1 + params->vr2)*1.0);

                norm = 0;
                if (isinf(p2) && p2 < 0) {
                    p1 = 1.0;
                    p2 = 0.0;
                } else {
                    norm = log_sum(p1,p2);
                    p1 -= norm;
                    p1 = exp(p1);
                }
                if (isnan(p1) || isinf(p1) || isinf(norm)) {
                    printf("\nd:%d i:%d a:%d p1:%lf p2:%lf\n",d,i,a,p1,p2);
                    debug("incorrect probs: infer_xi");
                }

                if (myrand() <= p1) {
                    if (latent->xi[d][i][a] == 0) {
                        for (k=0; k < params->K; k++)
                            counts->m[d][i][k] += counts->n_jv[d][a][k];
                        counts->m_k[d][i] += sent->N;
                    }
                    latent->xi[d][i][a] = 1;
                } else {
                    if (latent->xi[d][i][a] == 1) {
                        for (k=0; k < params->K; k++)
                            counts->m[d][i][k] -= counts->n_jv[d][a][k];
                        counts->m_k[d][i] -= sent->N;
                    }
                    latent->xi[d][i][a] = 0;
                }
                latent->xi_prob[d][i][a] = p1;
            } //a
        } //i
    } //d
}
void gmm_compute_p(int n, const float* v, const gmm_t* g, float* p, int flags)
{
    if (n==0) return; /* sgemm doesn't like empty matrices */

    long i, j, l;
    double dtmp;
    long d=g->d, k=g->k;

    float* logdetnr = fvec_new(k);

    for (j = 0; j < k; j++) {
        logdetnr[j] = -d / 2.0 * log(2 * M_PI);
        for (i = 0; i < d; i++)
            logdetnr[j] -= 0.5 * log(g->sigma[j * d + i]);
    }

    /* compute all probabilities in log domain */

    /* compute squared Mahalanobis distances (result in p) */
    if (0) { /* simple & slow */
        for (i = 0; i < n; i++) {
            for (j = 0; j < k; j++) {
                dtmp = 0;
                for (l = 0; l < d; l++) {
                    dtmp += sqr(v[i * d + l] - g->mu[j * d + l]) / g->sigma[j * d + l];
                }
                p[i * k + j] = dtmp;
            }
        }
    } else { /* complicated & fast */
        compute_mahalanobis_sqr(n,k,d,g->mu,g->sigma,v,p);
    }

    /* convert distances to probabilities, staying in the log domain
       until the very end */
    for (i = 0; i < n; i++) {
        for (j = 0; j < k; j++) {
            p[i * k + j] = logdetnr[j] - 0.5 * p[i * k + j];
            CHECKFINITE(p[i * k + j]);
        }

        /* at this point, we have p(x|ci) -> we want p(ci|x) */
        if (flags & GMM_FLAGS_NO_NORM) {
            /* compute the normalization factor */
            dtmp = 0;
        } else {
            dtmp = p[i * k + 0];
            if (flags & GMM_FLAGS_W) dtmp += log(g->w[0]);
            for (j = 1; j < k; j++) {
                double log_p = p[i * k + j];
                if (flags & GMM_FLAGS_W) log_p += log(g->w[j]);
                dtmp = log_sum(dtmp, log_p);
            }
            /* now dtmp contains the log of sums */
        }

        for (j = 0; j < k; j++) {
            double log_norm = 0;
            if (flags & GMM_FLAGS_W) log_norm = log(g->w[j]) - dtmp;
            else log_norm = -dtmp;
            p[i * k + j] = exp(p[i * k + j] + log_norm);
            CHECKFINITE(p[i * k + j]);
        }

        // printf ("p[%d] = ", i);
        // fvec_print (p + i * k, k);
    }
    free(logdetnr);
}
void
indel_digt_caller::
get_high_low_het_ratio_lhood(const starling_options& /*opt*/,
                             const starling_deriv_options& dopt,
                             const starling_sample_options& sample_opt,
                             const double indel_error_lnp,
                             const double indel_real_lnp,
                             const double ref_error_lnp,
                             const double ref_real_lnp,
                             const indel_key& ik,
                             const indel_data& id,
                             const double het_ratio,
                             const bool is_tier2_pass,
                             const bool is_use_alt_indel,
                             double& het_lhood_high,
                             double& het_lhood_low)
{
    // handle het ratio and its complement in one step:
    const double chet_ratio(1.-het_ratio);

    const double log_het_ratio(std::log(het_ratio));
    const double log_chet_ratio(std::log(chet_ratio));

    const bool is_breakpoint(ik.is_breakpoint());

    het_lhood_high=0;
    het_lhood_low=0;

    //typedef read_path_scores::alt_indel_t::const_iterator aiter;
    typedef indel_data::score_t::const_iterator siter;
    siter i(id.read_path_lnp.begin()), i_end(id.read_path_lnp.end());
    for (; i!=i_end; ++i)
    {
        const read_path_scores& path_lnp(i->second);

        // optionally skip tier2 data:
        if ((! is_tier2_pass) && (! path_lnp.is_tier1_read)) continue;

        // get alt path lnp:
        double alt_path_lnp(path_lnp.ref);
#if 0
        if (is_use_alt_indel && path_lnp.is_alt &&
            (path_lnp.alt > alt_path_lnp))
        {
            alt_path_lnp=path_lnp.alt;
        }
#else
        if (is_use_alt_indel && (! path_lnp.alt_indel.empty()))
        {
            typedef read_path_scores::alt_indel_t::const_iterator aiter;
            aiter j(path_lnp.alt_indel.begin()), j_end(path_lnp.alt_indel.end());
            for (; j!=j_end; ++j)
            {
                if (j->second>alt_path_lnp) alt_path_lnp=j->second;
            }
        }
#endif

        const double noindel_lnp(log_sum(alt_path_lnp+ref_real_lnp,path_lnp.indel+indel_error_lnp));
        const double hom_lnp(log_sum(alt_path_lnp+ref_error_lnp,path_lnp.indel+indel_real_lnp));

        // allele ratio convention is that the indel occurs at the
        // het_allele ratio and the alternate allele occurs at
        // (1-het_allele_ratio):
        {
            double log_ref_prob(log_chet_ratio);
            double log_indel_prob(log_het_ratio);
            if (! is_breakpoint)
            {
                get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank,
                                              ik,het_ratio,log_ref_prob,log_indel_prob);
            }
            const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob));

            het_lhood_low += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass);
        }

        {
            double log_ref_prob(log_het_ratio);
            double log_indel_prob(log_chet_ratio);
            if (! is_breakpoint)
            {
                get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank,
                                              ik,chet_ratio,log_ref_prob,log_indel_prob);
            }
            const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob));

            het_lhood_high += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass);
        }
    }
}
void update_phin(double *temp_phin, const mxArray *phi_n, const double *windexn,
                 const double *wcountn, const mxArray *model, const mxArray *data,
                 const double *psigammaptr, const int n, const int phase,
                 const double *annotations, int option)
{
    // number of words, maximum number of topics, maximum number of observed topics
    int ndistWords, nK, k1, k2, V, N, tempind, i, j, y, C2, Y;
    double logsum1, logsum2, minval, val, epsilon;
    mxArray *tmp;
    double *tmpptr, *tmp1, *log_beta, *mu, *eta, *classlabels;
    double *nwordspdoc = mxGetPr(mxGetField(data,0,"nwordspdoc"));

    minval = mxGetScalar(mxGetField(model,0,"MINVALUE"));
    ndistWords = mxGetM(phi_n);
    nK = mxGetN(phi_n);
    tmp = mxCreateDoubleMatrix(ndistWords,nK,mxREAL);
    tmpptr = mxGetPr(tmp);
    log_beta = mxGetPr(mxGetField(model,0,"log_beta"));
    V = mxGetScalar(mxGetField(model,0,"V"));
    //mexPrintf("till here ok1\n");

    if (option>=3)
    {
        classlabels = mxGetPr(mxGetField(data,0,"classlabels"));
        mu = mxGetPr(mxGetField(model,0,"mu"));
        eta = mxGetPr(mxGetField(model,0,"eta"));
        C2 = (int)mxGetScalar(mxGetField(model,0,"C2"));
        Y = (int)mxGetScalar(mxGetField(model,0,"Y"));
        if (option>=4)
        {
            k1 = mxGetScalar(mxGetField(model,0,"k1"));
            k2 = mxGetScalar(mxGetField(model,0,"k2"));
            epsilon = mxGetScalar(mxGetField(model,0,"epsilon"));
            //mexPrintf("\nY: %d\n",Y);
        }
    }

    int lowlimit, uplimit;
    if (option==5) // DSLDA-NSLT
    {
        lowlimit = k1 + ((int)classlabels[n]-1)*(k2/Y);
        uplimit = k1 + ((int)classlabels[n])*(k2/Y)-1;
        //mexPrintf("%d %d %d %d %d %d\n", n, k1, k2, lowlimit, uplimit, (int)classlabels[n]);
    }

    //mexPrintf("\t %d acajcjac %d",n, ndistWords);
    for (i=0; i<ndistWords; i++)
    {
        logsum1 = 0;
        logsum2 = 0;
        tmp1 = Malloc(double,nK);
        //mexPrintf("%d %d %d %d till here ok2\n", n, i, windexn[i], ndistWords);
        for (j=0; j<nK; j++)
        {
            tempind = j + ((int)(windexn[i])-1)*nK;
            //mexPrintf("\n%d %d %d %d %f\n",n, i,j, nK, log_beta[tempind]);
            val = 0;
            if (unlclass[n]==1 && (int)classlabels[n]>=1)
            {
                // use the dual variable in training phase only when the label
                // is present; no dual variable in test phase
                for (y=0; y<Y; y++)
                    val = val + mu[y*N+n]*(eta[j*Y+ (int)classlabels[n]-1] - eta[j*Y+y]);
                val = val/nwordspdoc[n];
            }
            //mexPrintf("\n%d %d %d %d %f\n",n, i,j, nK, log_beta[tempind]);
            if (option>=3)
                *(tmp1+j) = psigammaptr[j*N+n] + log_beta[tempind] + val; // access (j,i)th element from gamma
            if (option==1 || option==2)
                *(tmp1+j) = psigammaptr[j*N+n] + log_beta[tempind]; // access (j,i)th element from gamma

            if (option!=1 && option!=3 && ((option>=4 && j<k1) || (option==2 && j<nK)))
            {
                // supervised topics for options other than 1 (LDA) and 3 (MedLDA);
                // for option 2 (LLDA) all the topics are supervised, so we need
                // an extra clause
                if (option==2 && *(unlattr+j*N+n)==0); // LLDA
                else if (option==4 && j<k1 && *(unlattr+j*N+n)==0); // DSLDA
                else if (option==5 && ((j<k1 && *(unlattr+j*N+n)==0) || (j>=k1 && !(j>=lowlimit && j<=uplimit)))); // DSLDA-NSLT
                else if (option==7 && (j<k1 && *(unlattr+j*N+n)==0)); // DSLDA-OSST
                else logsum1 = log_sum(*(tmp1+j),logsum1);
            }
            else // unsupervised topics (training and test phases are identical except for DSLDA-NSLT)
            {
                if (phase==1 && (int)classlabels[n]>=1 && option==5 && !(j>=lowlimit && j<=uplimit)); // skip in DSLDA-NSLT's training phase
                else logsum2 = log_sum(*(tmp1+j),logsum2);
            }
        }

        // conversion from log space to real number
        for (j=0; j<nK; j++)
        {
            if (option!=1 && option!=3 && ((option>=4 && j<k1) || (option==2 && j<nK)))
            {
                // supervised topics for options other than 1 (LDA) and 3 (MedLDA);
                // for option 2 (LLDA) all the topics are supervised, so we need
                // an extra clause; the if conditions say when to ignore phi's for
                // supervised topics in the training phase
                if (option==2 && *(unlattr+j*N+n)==0) temp_phin[j*ndistWords+i] = 0; // LLDA
                else if (option==4 && j<k1 && *(unlattr+j*N+n)==0) temp_phin[j*ndistWords+i] = 0; // DSLDA
                else if (option==5 && j<k1 && *(unlattr+j*N+n)==0) temp_phin[j*ndistWords+i] = 0; // DSLDA-NSLT
                else if (option==7 && (j<k1 && *(unlattr+j*N+n)==0)) temp_phin[j*ndistWords+i] = 0; // DSLDA-OSST
                else if (logsum1 - *(tmp1+j)>100) temp_phin[j*ndistWords+i] = minval; // (j,i)th element
                else if (logsum1 - *(tmp1+j)<100) temp_phin[j*ndistWords+i] = epsilon*exp(*(tmp1+j)-logsum1)+minval; // (j,i)th element
                else; // do nothing
            }
            else // unsupervised topics
            {
                if ((int)classlabels[n]>=1 && option==5 && !(j>=lowlimit && j<=uplimit))
                    // DSLDA-NSLT: skip in training phase only if the class label is present
                    tmpptr[j*ndistWords+i] = 0;
                else // no distinction between training and test phase except for DSLDA-NSLT
                {
                    if (logsum2 - *(tmp1+j)>100) temp_phin[j*ndistWords+i] = minval; // (j,i)th element
                    if (logsum2 - *(tmp1+j)<100)
                    {
                        if (option>=4) temp_phin[j*ndistWords+i] = (1-epsilon)*exp(*(tmp1+j)-logsum2)+minval; // (j,i)th element
                        else temp_phin[j*ndistWords+i] = exp(*(tmp1+j)-logsum2)+minval; // (j,i)th element
                    }
                }
            }
        }
        free(tmp1);
    }
    return;
}