static void writeEstimate(ostream& out, const ContigNode& id0, const ContigNode& id1, unsigned len0, unsigned len1, const Pairs& pairs, const PMF& pmf) { if (pairs.size() < opt::npairs) return; DistanceEst est; est.distance = estimateDistance(len0, len1, pairs, pmf, est.numPairs); est.stdDev = pmf.getSampleStdDev(est.numPairs); std::pair<ContigNode, ContigNode> e(id0, id1 ^ id0.sense()); if (est.numPairs >= opt::npairs) { if (opt::format == DOT) { #pragma omp critical(out) out << get(g_contigNames, e) << " [" << est << "]\n"; } else out << ' ' << get(g_contigNames, id1) << ',' << est; } else if (opt::verbose > 1) { #pragma omp critical(cerr) cerr << "warning: " << get(g_contigNames, e) << " [d=" << est.distance << "] " << est.numPairs << " of " << pairs.size() << " pairs fit the expected distribution\n"; } }
// term of non-HW HMatrix inline double term_ij( double Bp, double Bq, const PMF<Allele> &p, const PMF<Allele> &q, const BackFreq &hback, Allele const &i, Allele const &j) { // NB in this term (i, j) is ordered double ret = p.val(i) * q.val(j); if (Bq>0) ret += Bq * p.val(i) * hback(j, i); if (Bp>0) ret += Bp * q.val(j) * hback(i, j); if (Bp*Bq>0) ret += Bp * Bq * hback.pOrdered(make_pair(i,j)); return ret; }
/** Estimate the distance between two contigs using the difference of * the population mean and the sample mean. * @param numPairs [out] the number of pairs that agree with the * expected distribution * @return the estimated distance */ static int estimateDistanceUsingMean( const std::vector<int>& samples, const PMF& pmf, unsigned& numPairs) { Histogram h(samples.begin(), samples.end()); int d = (int)round(pmf.mean() - h.mean()); // Count the number of samples that agree with the distribution. unsigned n = 0; for (Histogram::const_iterator it = h.begin(); it != h.end(); ++it) if (pmf[it->first + d] > pmf.minProbability()) n += it->second; numPairs = n; return d; }
// This gives non-HW treatment of the 'F' term HMatrix makeHMatrixNHW( const PMF<Allele> &p, const PMF<Allele> &q, const BackFreq &hback, float delta, bool sparse) { // How much background to add in? // If the input HMatrix is not normalized then we make up the difference // with background. Otherwise we add in delta. double Bp = std::max((double)delta, 1 - p.sum()); // background for p double Bq = std::max((double)delta, 1 - q.sum()); // background for q // apply this formula: // H(ij) = p(i)q(j) + Bq p(i)b(j|i) + Bp (q(j)b(i|j) + Bp Bq Bij HMatrix ret; // loop over all elements in the background. This gives us the upper // triangular terms only. We need to sum over all terms. HMatrix &href = (HMatrix&)hback; // to use base class member PMF< std::pair<Allele, Allele> >::iterator it; for (it = href.m_pmf.begin(); it != href.m_pmf.end(); ++it) { Allele i = it->first.first; Allele j = it->first.second; double h_ij = term_ij(Bp, Bq, p, q, hback, i, j); // upper-triangular term if (i != j) { h_ij += term_ij(Bp, Bq, p, q, hback, j, i); // lower-triangular term } if (!sparse || h_ij > 0) ret.set(i, j, h_ij); } ret.normalize(); // just in case return ret; }
// check for alleles not in population database bool AlleleSet::checkBackground(PMF<Allele> const &background) const { bool ret = true; std::vector< PMF<Allele> >::const_iterator ip; for(ip = m_pmfs.begin(); ip != m_pmfs.end(); ++ip) { PMF<Allele>::const_iterator ia; for(ia = ip->begin(); ia != ip->end(); ++ia) { if (background.find(ia->first) == background.end()) { // Allele not in database. warn << startl << "allele not in population database: " << ia->first.string() << std::endl; ret = false; } } } return ret; }
/** Compute the log likelihood that these samples came from the * specified distribution shifted by the parameter theta. * @param theta the parameter of the PMF, f_theta(x) * @param samples the samples * @param pmf the probability mass function * @return the log likelihood */ static pair<double, unsigned> computeLikelihood(int theta, const Histogram& samples, const PMF& pmf) { double likelihood = 0; unsigned nsamples = 0; for (Histogram::const_iterator it = samples.begin(); it != samples.end(); ++it) { double p = pmf[it->first + theta]; unsigned n = it->second; likelihood += n * log(p); if (p > pmf.minProbability()) nsamples += n; } return make_pair(likelihood, nsamples); }