/** Attempt to merge the paths specified in mergeQ with path. * @return the number of paths merged */ static unsigned mergePaths(const Lengths& lengths, ContigPath& path, deque<ContigNode>& mergeQ, set<ContigNode>& seen, const ContigPathMap& paths) { unsigned merged = 0; deque<ContigNode> invalid; for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) { pivot = mergeQ.front(); ContigPathMap::const_iterator path2It = paths.find(pivot.contigIndex()); if (path2It == paths.end()) continue; ContigPath path2 = path2It->second; if (pivot.sense()) reverseComplement(path2.begin(), path2.end()); ContigPath consensus = align(lengths, path, path2, pivot); if (consensus.empty()) { invalid.push_back(pivot); continue; } appendToMergeQ(mergeQ, seen, path2); path.swap(consensus); if (gDebugPrint) #pragma omp critical(cout) cout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << '\t' << path << '\n'; merged++; } mergeQ.swap(invalid); return merged; }
/**
 * Return the properties of vertex u.
 * For an unambiguous vertex this is the bundle stored in the graph.
 * For an ambiguous vertex it is a ContigProperties built from
 * u.length() + opt::k - 1 and 0.
 */
static inline ContigProperties get(vertex_bundle_t,
		const Graph& g, ContigNode u)
{
	if (!u.ambiguous())
		return g[u];
	return ContigProperties(u.length() + opt::k - 1, 0);
}
/**
 * Return the path that vertex u stands for.
 * A vertex that is not a path (see isPath) yields a path holding
 * only u. Otherwise the stored path at index
 * u.id() - Vertex::s_offset is returned, reverse complemented when
 * u.sense() is set.
 */
static ContigPath getPath(const Paths& paths, const ContigNode& u)
{
	if (!isPath(u))
		return ContigPath(1, u);

	unsigned index = u.id() - Vertex::s_offset;
	if (u.sense())
		return reverseComplement(paths[index]);
	return paths[index];
}
/**
 * Return a copy of the path stored under u.contigIndex(), reverse
 * complemented when u.sense() is set.
 * The path must exist in paths; this is checked with an assertion.
 */
static ContigPath getPath(const ContigPathMap& paths, ContigNode u)
{
	ContigPathMap::const_iterator found = paths.find(u.contigIndex());
	assert(found != paths.end());

	ContigPath result(found->second);
	if (u.sense())
		reverseComplement(result.begin(), result.end());
	return result;
}
/**
 * Return the sequence of the specified contig node.
 *
 * An unambiguous node yields the stored contig sequence, reverse
 * complemented when id.sense() is set. An ambiguous node yields its
 * ambiguous sequence prefixed with opt::k - 1 'N' characters; the
 * ambiguous sequence is lower-cased first when it is shorter than
 * opt::k.
 */
static Sequence sequence(const Contigs& contigs, const ContigNode& id)
{
	if (!id.ambiguous()) {
		const Sequence& stored = contigs[id.id()].seq;
		if (id.sense())
			return reverseComplement(stored);
		return stored;
	}

	string gap(id.ambiguousSequence());
	if (gap.length() < opt::k)
		transform(gap.begin(), gap.end(), gap.begin(), ::tolower);
	return string(opt::k - 1, 'N') + gap;
}
/**
 * Return the sequence of the specified contig node, read from the
 * global g_contigs.
 *
 * An unambiguous node yields the stored contig sequence, reverse
 * complemented when id.sense() is set. An ambiguous node yields its
 * ambiguous sequence prefixed with opt::k - 1 'N' characters; the
 * ambiguous sequence is lower-cased first when it is shorter than
 * opt::k.
 */
static const Sequence getSequence(ContigNode id)
{
	if (!id.ambiguous()) {
		string stored(g_contigs[id.id()]);
		if (id.sense())
			return reverseComplement(stored);
		return stored;
	}

	string gap(id.ambiguousSequence());
	if (gap.length() < opt::k)
		transform(gap.begin(), gap.end(), gap.begin(), ::tolower);
	return string(opt::k - 1, 'N') + gap;
}
static void findOverlap(const Graph& g, ContigID refID, bool rc, const ContigNode& pair, const DistanceEst& est, OverlapGraph& out) { if (refID == pair.id() || (est.distance >= 0 && !opt::scaffold)) return; ContigNode ref(refID, false); const ContigNode& t = rc ? pair : ref; const ContigNode& h = rc ? ref : pair; if (out_degree(t, g) > 0 || in_degree(h, g) > 0 || edge(t, h, out).second) return; bool mask = false; unsigned overlap = est.distance - (int)allowedError(est.stdDev) <= 0 ? findOverlap(g, t, h, mask) : 0; if (mask && !opt::mask) return; if (overlap > 0 || opt::scaffold) add_edge(t, h, Overlap(est, overlap, mask), out); }
static void writeEstimate(ostream& out, const ContigNode& id0, const ContigNode& id1, unsigned len0, unsigned len1, const Pairs& pairs, const PMF& pmf) { if (pairs.size() < opt::npairs) return; DistanceEst est; est.distance = estimateDistance(len0, len1, pairs, pmf, est.numPairs); est.stdDev = pmf.getSampleStdDev(est.numPairs); std::pair<ContigNode, ContigNode> e(id0, id1 ^ id0.sense()); if (est.numPairs >= opt::npairs) { if (opt::format == DOT) { #pragma omp critical(out) out << get(g_contigNames, e) << " [" << est << "]\n"; } else out << ' ' << get(g_contigNames, id1) << ',' << est; } else if (opt::verbose > 1) { #pragma omp critical(cerr) cerr << "warning: " << get(g_contigNames, e) << " [d=" << est.distance << "] " << est.numPairs << " of " << pairs.size() << " pairs fit the expected distribution\n"; } }
/**
 * Add the overlaps of vseq to the graph.
 *
 * Starting from vseq without its last character, the query is looked
 * up in the suffix array and then shortened with chop, for as long as
 * it is at least opt::minOverlap characters long. Every vertex u
 * found gets one edge u -> v whose property is the negated length of
 * the matched key. Only the first match per vertex is used.
 *
 * @param g [in,out] the graph receiving the edges
 * @param sa the suffix array to search
 * @param v the vertex whose sequence is vseq
 * @param vseq the sequence of v; must not be empty
 */
static void addOverlapsSA(Graph& g, const SuffixArray& sa,
		ContigNode v, const string& vseq)
{
	assert(!vseq.empty());
	typedef SuffixArray::const_iterator It;

	set<ContigNode> seen;
	string query(vseq, 0, vseq.size() - 1);
	while (query.size() >= opt::minOverlap) {
		pair<It, It> hits = sa.equal_range(query);
		for (It hit = hits.first; hit != hits.second; ++hit) {
			ContigNode u(hit->second);
			if (opt::ss && u.sense() != v.sense())
				continue;
			// Queries go from long to short, so the first match
			// for a vertex is its longest overlap; skip the rest.
			if (!seen.insert(u).second)
				continue;
			unsigned overlap = hit->first.size();
			add_edge(u, v, -overlap, static_cast<DG&>(g));
		}
		chop(query);
	}
}
/** Find a path for the specified distance estimates. * @param out [out] the solution path */ static void handleEstimate(const Graph& g, const EstimateRecord& er, bool dirIdx, ContigPath& out) { if (er.estimates[dirIdx].empty()) return; ContigNode origin(er.refID, dirIdx); ostringstream vout_ss; ostream bitBucket(NULL); ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket; vout << "\n* " << get(vertex_name, g, origin) << '\n'; unsigned minNumPairs = UINT_MAX; // generate the reachable set Constraints constraints; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; minNumPairs = min(minNumPairs, ep.numPairs); constraints.push_back(Constraint(v, ep.distance + allowedError(ep.stdDev))); } vout << "Constraints:"; printConstraints(vout, g, constraints) << '\n'; ContigPaths solutions; unsigned numVisited = 0; constrainedSearch(g, origin, constraints, solutions, numVisited); bool tooComplex = numVisited >= opt::maxCost; bool tooManySolutions = solutions.size() > opt::maxPaths; set<ContigID> repeats = findRepeats(er.refID, solutions); if (!repeats.empty()) { vout << "Repeats:"; for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it) vout << ' ' << get(g_contigNames, *it); vout << '\n'; } unsigned numPossiblePaths = solutions.size(); if (numPossiblePaths > 0) vout << "Paths: " << numPossiblePaths << '\n'; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end();) { vout << *solIter << '\n'; // Calculate the path distance to each node and see if // it is within the estimated distance. map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); // Remove solutions whose distance estimates are not correct. 
unsigned validCount = 0, invalidCount = 0, ignoredCount = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; vout << get(vertex_name, g, v) << ',' << ep << '\t'; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); if (dmIter == distanceMap.end()) { // This contig is a repeat. ignoredCount++; vout << "ignored\n"; continue; } // translate distance by -overlap to match // coordinate space used by the estimate int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; unsigned buffer = allowedError(ep.stdDev); bool invalid = (unsigned)abs(diff) > buffer; bool repeat = repeats.count(v.contigIndex()) > 0; bool ignored = invalid && repeat; if (ignored) ignoredCount++; else if (invalid) invalidCount++; else validCount++; vout << "dist: " << actualDistance << " diff: " << diff << " buffer: " << buffer << " n: " << ep.numPairs << (ignored ? " ignored" : invalid ? 
" invalid" : "") << '\n'; } if (invalidCount == 0 && validCount > 0) ++solIter; else solIter = solutions.erase(solIter); } vout << "Solutions: " << solutions.size(); if (tooComplex) vout << " (too complex)"; if (tooManySolutions) vout << " (too many solutions)"; vout << '\n'; ContigPaths::iterator bestSol = solutions.end(); int minDiff = 999999; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end(); ++solIter) { map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); int sumDiff = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; if (repeats.count(v.contigIndex()) > 0) continue; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); assert(dmIter != distanceMap.end()); int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; sumDiff += abs(diff); } if (sumDiff < minDiff) { minDiff = sumDiff; bestSol = solIter; } vout << *solIter << " length: " << calculatePathLength(g, origin, *solIter) << " sumdiff: " << sumDiff << '\n'; } /** Lock the debugging stream. 
*/ static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&coutMutex); stats.totalAttempted++; g_minNumPairs = min(g_minNumPairs, minNumPairs); if (tooComplex) { stats.tooComplex++; } else if (tooManySolutions) { stats.tooManySolutions++; } else if (numPossiblePaths == 0) { stats.noPossiblePaths++; } else if (solutions.empty()) { stats.noValidPaths++; } else if (repeats.count(er.refID) > 0) { vout << "Repeat: " << get(vertex_name, g, origin) << '\n'; stats.repeat++; } else if (solutions.size() > 1) { ContigPath path = constructAmbiguousPath(g, origin, solutions); if (!path.empty()) { if (opt::extend) extend(g, path.back(), back_inserter(path)); vout << path << '\n'; if (opt::scaffold) { out.insert(out.end(), path.begin(), path.end()); g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } } stats.multiEnd++; } else { assert(solutions.size() == 1); assert(bestSol != solutions.end()); ContigPath& path = *bestSol; if (opt::verbose > 1) printDistanceMap(vout, g, origin, path); if (opt::extend) extend(g, path.back(), back_inserter(path)); out.insert(out.end(), path.begin(), path.end()); stats.uniqueEnd++; g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } cout << vout_ss.str(); if (!out.empty()) assert(!out.back().ambiguous()); pthread_mutex_unlock(&coutMutex); }
/**
 * Return the sequence of the specified contig, read from the global
 * g_contigs and reverse complemented when id.sense() is set.
 */
static string sequence(const ContigNode& id)
{
	const string& stored = g_contigs[id.id()];
	if (id.sense())
		return reverseComplement(stored);
	return stored;
}
/**
 * Return whether this vertex is a path rather than a contig.
 * A vertex is a path when its id is Vertex::s_offset or greater.
 */
static bool isPath(const ContigNode& u)
{
	const bool idAtOrPastOffset = u.id() >= Vertex::s_offset;
	return idAtOrPastOffset;
}
/**
 * Identify paths subsumed by the specified path.
 *
 * Every unambiguous contig of the seed path, other than the seed
 * contig itself, that has a path of its own is aligned against the
 * seed path. Depending on the consensus:
 *  - no consensus: the other path is skipped;
 *  - consensus equals the seed path: the other path is subsumed;
 *  - consensus equals the other path: the other path is larger, so
 *    the search restarts with it as the seed;
 *  - consensus is a cycle and at least one of the two paths is a
 *    cycle: if both are, the other path is subsumed; otherwise the
 *    path that is not a cycle is recorded as overlapping;
 *  - anything else: both paths are recorded as overlapping.
 *
 * Debugging output is collected when gDebugPrint is set and written
 * to cout before returning; output collected by a seed that is
 * replaced by a larger one is not written.
 *
 * @param lengths contig lengths, forwarded to align and isCycle
 * @param path1It the seed path
 * @param paths all paths
 * @param out [out] cleared, then filled with the IDs of the paths
 *        subsumed by the returned seed
 * @param overlaps [out] paths that are found to overlap
 * @return the ID of the subsuming path
 */
static ContigID identifySubsumedPaths(const Lengths& lengths,
		ContigPathMap::const_iterator path1It,
		ContigPathMap& paths,
		set<ContigID>& out,
		set<ContigID>& overlaps)
{
	ostringstream vout;
	out.clear();
	ContigID id(path1It->first);
	const ContigPath& path = path1It->second;
	if (gDebugPrint)
		vout << get(g_contigNames, ContigNode(id, false))
			<< '\t' << path << '\n';

	for (ContigPath::const_iterator it = path.begin();
			it != path.end(); ++it) {
		ContigNode pivot = *it;
		if (pivot.ambiguous() || pivot.id() == id)
			continue;
		ContigPathMap::iterator path2It
			= paths.find(pivot.contigIndex());
		if (path2It == paths.end())
			continue;

		// Work on a copy oriented like the pivot.
		ContigPath path2 = path2It->second;
		if (pivot.sense())
			reverseComplement(path2.begin(), path2.end());

		ContigPath consensus = align(lengths, path, path2, pivot);
		if (consensus.empty())
			continue;

		if (equalIgnoreAmbiguos(consensus, path)) {
			if (gDebugPrint)
				vout << get(g_contigNames, pivot)
					<< '\t' << path2 << '\n';
			out.insert(path2It->first);
			continue;
		}

		if (equalIgnoreAmbiguos(consensus, path2)) {
			// This path is larger. Use it as the seed.
			return identifySubsumedPaths(lengths, path2It, paths,
					out, overlaps);
		}

		// The two paths are only tested for being cycles when the
		// consensus is one.
		bool isCyclePath1 = false;
		bool isCyclePath2 = false;
		if (isCycle(lengths, consensus)) {
			isCyclePath1 = isCycle(lengths, path);
			isCyclePath2 = isCycle(lengths, path2);
		}

		if (isCyclePath1 || isCyclePath2) {
			// The consensus and at least one path are cycles.
			if (gDebugPrint)
				vout << get(g_contigNames, pivot)
					<< '\t' << path2 << '\n'
					<< "cycle\t" << consensus << '\n';
			if (isCyclePath1 && isCyclePath2)
				out.insert(path2It->first);
			else if (isCyclePath1)
				overlaps.insert(path2It->first);
			else
				overlaps.insert(id);
		} else {
			// Either the consensus is not a cycle, or it is a
			// cycle but neither path is.
			if (gDebugPrint)
				vout << get(g_contigNames, pivot)
					<< '\t' << path2 << '\n'
					<< "ignored\t" << consensus << '\n';
			overlaps.insert(id);
			overlaps.insert(path2It->first);
		}
	}

	cout << vout.str();
	return id;
}
/**
 * Return true if the two contigs compare equal, or if both of them
 * are ambiguous.
 */
static bool equalOrBothAmbiguos(const ContigNode& a, const ContigNode& b)
{
	if (a == b)
		return true;
	return a.ambiguous() && b.ambiguous();
}
/**
 * Return the length of the specified contig in k-mer.
 * An ambiguous contig reports its own length; any other contig is
 * looked up in lengths by its id using the bounds-checked at().
 */
static unsigned getLength(const Lengths& lengths, const ContigNode& u)
{
	if (u.ambiguous())
		return u.length();
	return lengths.at(u.id());
}