/**
 * Return the most likely distance between two contigs and the number
 * of pairs that support that estimate.
 *
 * Every candidate distance theta in the (clamped and padded) search
 * range is scored with computeLikelihood(), normalized by the
 * window-weighted mass of the PMF, and the resulting curve is
 * smoothed with a Hann window before the maximum is picked.
 *
 * @param first   lower bound of the candidate distances
 * @param last    upper bound of the candidate distances
 * @param samples histogram of the observed values
 * @param pmf     the expected distribution
 * @param len0    first argument of the WindowFunction
 *                (presumably the length of one contig -- confirm)
 * @param len1    second argument of the WindowFunction
 *                (presumably the length of the other contig -- confirm)
 * @return the best distance and the number of pairs reported for it
 *         by computeLikelihood(); if no candidate has any supporting
 *         pair, the padded lower bound and a count of zero
 */
static pair<int, unsigned>
maximumLikelihoodEstimate(int first, int last,
		const Histogram& samples, const PMF& pmf,
		unsigned len0, unsigned len1)
{
	// The smoothing filter spans about 10% of the PMF mean;
	// 2k + 3 keeps its size odd so that it has a centre tap.
	const int filterSize = 2 * (int)(0.05 * pmf.mean()) + 3;
	const int halfFilter = filterSize / 2;

	// Clamp the search range to what the PMF and the samples allow,
	// then pad both ends so the filter fits around every candidate.
	first = max(first, (int)pmf.minValue() - samples.maximum())
		- halfFilter;
	last = min(last, (int)pmf.maxValue() - samples.minimum())
		+ halfFilter + 1;

	/* When randomly selecting fragments that span a given point,
	 * longer fragments are more likely to be selected than
	 * shorter fragments.
	 */
	WindowFunction window(len0, len1);

	const unsigned sampleCount = samples.size();

	// Parallel arrays, one entry per candidate theta.
	vector<double> rawScore;
	vector<unsigned> pairCount;
	vector<int> candidate;
	for (int theta = first; theta <= last; ++theta) {
		// Calculate the normalizing constant of the PMF, f_theta(x).
		double norm = 0;
		for (int x = pmf.minValue(); x <= (int)pmf.maxValue(); ++x)
			norm += pmf[x] * window(x - theta);

		double score;
		unsigned supporting;
		tie(score, supporting) = computeLikelihood(theta, samples, pmf);
		score -= sampleCount * log(norm);

		rawScore.push_back(score);
		pairCount.push_back(supporting);
		candidate.push_back(theta);
	}

	// Smooth the scores and keep the best candidate that has at
	// least one supporting pair.
	HannWindow filter(filterSize);
	double bestLikelihood = -numeric_limits<double>::max();
	int bestTheta = first;
	unsigned bestn = 0;
	const int stop = (int)rawScore.size() - halfFilter;
	for (int centre = halfFilter; centre < stop; ++centre) {
		double smoothed = 0;
		for (int tap = -halfFilter; tap <= halfFilter; ++tap) {
			const int k = centre + tap;
			assert(k >= 0 && (unsigned)k < rawScore.size());
			smoothed += filter(tap) * rawScore[k];
		}

		// Written as !(a > b) rather than (a <= b) so that a NaN
		// score never replaces the current best.
		if (pairCount[centre] == 0 || !(smoothed > bestLikelihood))
			continue;

		bestLikelihood = smoothed;
		bestTheta = candidate[centre];
		bestn = pairCount[centre];
	}
	return make_pair(bestTheta, bestn);
}
/** Estimate the distance between two contigs using the difference of * the population mean and the sample mean. * @param numPairs [out] the number of pairs that agree with the * expected distribution * @return the estimated distance */ static int estimateDistanceUsingMean( const std::vector<int>& samples, const PMF& pmf, unsigned& numPairs) { Histogram h(samples.begin(), samples.end()); int d = (int)round(pmf.mean() - h.mean()); // Count the number of samples that agree with the distribution. unsigned n = 0; for (Histogram::const_iterator it = h.begin(); it != h.end(); ++it) if (pmf[it->first + d] > pmf.minProbability()) n += it->second; numPairs = n; return d; }