/** Remove ambiguous contigs from the ends of the path. */ static void removeAmbiguousContigs(ContigPath& path) { if (!path.empty() && path.back().ambiguous()) path.erase(path.end() - 1); if (!path.empty() && path.front().ambiguous()) path.erase(path.begin()); }
/** Merge the paths of the specified seed path. * @return the merged contig path */ static ContigPath mergePath(const Lengths& lengths, const ContigPathMap& paths, const ContigPath& seedPath) { assert(!seedPath.empty()); ContigNode seed1 = seedPath.front(); ContigPathMap::const_iterator path1It = paths.find(seed1.contigIndex()); assert(path1It != paths.end()); ContigPath path(path1It->second); if (seedPath.front().sense()) reverseComplement(path.begin(), path.end()); if (opt::verbose > 1) #pragma omp critical(cout) cout << "\n* " << seedPath << '\n' << get(g_contigNames, seedPath.front()) << '\t' << path << '\n'; for (ContigPath::const_iterator it = seedPath.begin() + 1; it != seedPath.end(); ++it) { ContigNode seed2 = *it; ContigPathMap::const_iterator path2It = paths.find(seed2.contigIndex()); assert(path2It != paths.end()); ContigPath path2 = path2It->second; if (seed2.sense()) reverseComplement(path2.begin(), path2.end()); ContigNode pivot = find(path.begin(), path.end(), seed2) != path.end() ? seed2 : seed1; ContigPath consensus = align(lengths, path, path2, pivot); if (consensus.empty()) { // This seed could be removed from the seed path. if (opt::verbose > 1) #pragma omp critical(cout) cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << "\tinvalid\n"; } else { path.swap(consensus); if (opt::verbose > 1) #pragma omp critical(cout) cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << '\t' << path << '\n'; } seed1 = seed2; } return path; }
/** Read contig paths from the specified file. * @param g the contig adjacency graph * @param inPath the file of contig paths * @param[out] pathIDs the path IDs * @return the paths */ static Paths readPaths(Graph& g, const string& inPath, vector<string>& pathIDs) { typedef graph_traits<Graph>::vertex_descriptor V; assert(pathIDs.empty()); ifstream fin(inPath.c_str()); if (opt::verbose > 0) cerr << "Reading `" << inPath << "'..." << endl; if (inPath != "-") assert_good(fin, inPath); istream& in = inPath == "-" ? cin : fin; assert_good(in, inPath); Paths paths; string id; ContigPath path; while (in >> id >> path) { if (path.empty()) { // Remove this contig from the graph. V u = find_vertex(id, false, g); clear_vertex(u, g); remove_vertex(u, g); } else { pathIDs.push_back(id); paths.push_back(path); } } assert(in.eof()); return paths; }
/** Attempt to merge the paths specified in mergeQ with path. * @return the number of paths merged */ static unsigned mergePaths(const Lengths& lengths, ContigPath& path, deque<ContigNode>& mergeQ, set<ContigNode>& seen, const ContigPathMap& paths) { unsigned merged = 0; deque<ContigNode> invalid; for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) { pivot = mergeQ.front(); ContigPathMap::const_iterator path2It = paths.find(pivot.contigIndex()); if (path2It == paths.end()) continue; ContigPath path2 = path2It->second; if (pivot.sense()) reverseComplement(path2.begin(), path2.end()); ContigPath consensus = align(lengths, path, path2, pivot); if (consensus.empty()) { invalid.push_back(pivot); continue; } appendToMergeQ(mergeQ, seen, path2); path.swap(consensus); if (gDebugPrint) #pragma omp critical(cout) cout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << '\t' << path << '\n'; merged++; } mergeQ.swap(invalid); return merged; }
/** Add an edge if the two paths overlap. * @param pivot the pivot at which to seed the alignment * @return whether an overlap was found */ static bool addOverlapEdge(const Lengths& lengths, PathGraph& gout, ContigNode pivot, ContigNode seed1, const ContigPath& path1, ContigNode seed2, const ContigPath& path2) { assert(seed1 != seed2); // Determine the orientation of the overlap edge. dir_type orientation = DIR_X; ContigPath consensus = align(lengths, path1, path2, pivot, orientation); if (consensus.empty()) return false; assert(orientation != DIR_X); if (orientation == DIR_B) { // One of the paths subsumes the other. Use the order of the // seeds to determine the orientation of the edge. orientation = find(consensus.begin(), consensus.end(), seed1) < find(consensus.begin(), consensus.end(), seed2) ? DIR_F : DIR_R; } assert(orientation == DIR_F || orientation == DIR_R); // Add the edge. ContigNode u = orientation == DIR_F ? seed1 : seed2; ContigNode v = orientation == DIR_F ? seed2 : seed1; bool added = false; #pragma omp critical(gout) if (!edge(u, v, gout).second) { add_edge(u, v, gout); added = true; } return added; }
/** Merge a sequence of overlapping paths. */ static ContigPath mergePaths(const Paths& paths, const OverlapMap& overlaps, const ContigPath& merge) { assert(!merge.empty()); ContigNode u = merge.front(); ContigPath path(getPath(paths, u)); for (ContigPath::const_iterator it = merge.begin() + 1; it != merge.end(); ++it) { ContigNode v = *it; ContigPath vpath(getPath(paths, v)); unsigned overlap = getOverlap(overlaps, u, v); assert(path.size() > overlap); assert(vpath.size() > overlap); assert(equal(path.end() - overlap, path.end(), vpath.begin())); path.insert(path.end(), vpath.begin() + overlap, vpath.end()); u = v; } return path; }
/** Find a path for the specified distance estimates. * @param out [out] the solution path */ static void handleEstimate(const Graph& g, const EstimateRecord& er, bool dirIdx, ContigPath& out) { if (er.estimates[dirIdx].empty()) return; ContigNode origin(er.refID, dirIdx); ostringstream vout_ss; ostream bitBucket(NULL); ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket; vout << "\n* " << get(vertex_name, g, origin) << '\n'; unsigned minNumPairs = UINT_MAX; // generate the reachable set Constraints constraints; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; minNumPairs = min(minNumPairs, ep.numPairs); constraints.push_back(Constraint(v, ep.distance + allowedError(ep.stdDev))); } vout << "Constraints:"; printConstraints(vout, g, constraints) << '\n'; ContigPaths solutions; unsigned numVisited = 0; constrainedSearch(g, origin, constraints, solutions, numVisited); bool tooComplex = numVisited >= opt::maxCost; bool tooManySolutions = solutions.size() > opt::maxPaths; set<ContigID> repeats = findRepeats(er.refID, solutions); if (!repeats.empty()) { vout << "Repeats:"; for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it) vout << ' ' << get(g_contigNames, *it); vout << '\n'; } unsigned numPossiblePaths = solutions.size(); if (numPossiblePaths > 0) vout << "Paths: " << numPossiblePaths << '\n'; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end();) { vout << *solIter << '\n'; // Calculate the path distance to each node and see if // it is within the estimated distance. map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); // Remove solutions whose distance estimates are not correct. unsigned validCount = 0, invalidCount = 0, ignoredCount = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; vout << get(vertex_name, g, v) << ',' << ep << '\t'; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); if (dmIter == distanceMap.end()) { // This contig is a repeat. ignoredCount++; vout << "ignored\n"; continue; } // translate distance by -overlap to match // coordinate space used by the estimate int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; unsigned buffer = allowedError(ep.stdDev); bool invalid = (unsigned)abs(diff) > buffer; bool repeat = repeats.count(v.contigIndex()) > 0; bool ignored = invalid && repeat; if (ignored) ignoredCount++; else if (invalid) invalidCount++; else validCount++; vout << "dist: " << actualDistance << " diff: " << diff << " buffer: " << buffer << " n: " << ep.numPairs << (ignored ? " ignored" : invalid ? " invalid" : "") << '\n'; } if (invalidCount == 0 && validCount > 0) ++solIter; else solIter = solutions.erase(solIter); } vout << "Solutions: " << solutions.size(); if (tooComplex) vout << " (too complex)"; if (tooManySolutions) vout << " (too many solutions)"; vout << '\n'; ContigPaths::iterator bestSol = solutions.end(); int minDiff = 999999; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end(); ++solIter) { map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); int sumDiff = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; if (repeats.count(v.contigIndex()) > 0) continue; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); assert(dmIter != distanceMap.end()); int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; sumDiff += abs(diff); } if (sumDiff < minDiff) { minDiff = sumDiff; bestSol = solIter; } vout << *solIter << " length: " << calculatePathLength(g, origin, *solIter) << " sumdiff: " << sumDiff << '\n'; } /** Lock the debugging stream. */ static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&coutMutex); stats.totalAttempted++; g_minNumPairs = min(g_minNumPairs, minNumPairs); if (tooComplex) { stats.tooComplex++; } else if (tooManySolutions) { stats.tooManySolutions++; } else if (numPossiblePaths == 0) { stats.noPossiblePaths++; } else if (solutions.empty()) { stats.noValidPaths++; } else if (repeats.count(er.refID) > 0) { vout << "Repeat: " << get(vertex_name, g, origin) << '\n'; stats.repeat++; } else if (solutions.size() > 1) { ContigPath path = constructAmbiguousPath(g, origin, solutions); if (!path.empty()) { if (opt::extend) extend(g, path.back(), back_inserter(path)); vout << path << '\n'; if (opt::scaffold) { out.insert(out.end(), path.begin(), path.end()); g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } } stats.multiEnd++; } else { assert(solutions.size() == 1); assert(bestSol != solutions.end()); ContigPath& path = *bestSol; if (opt::verbose > 1) printDistanceMap(vout, g, origin, path); if (opt::extend) extend(g, path.back(), back_inserter(path)); out.insert(out.end(), path.begin(), path.end()); stats.uniqueEnd++; g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } cout << vout_ss.str(); if (!out.empty()) assert(!out.back().ambiguous()); pthread_mutex_unlock(&coutMutex); }
/** Identify paths subsumed by the specified path. * @param overlaps [out] paths that are found to overlap * @return the ID of the subsuming path */ static ContigID identifySubsumedPaths(const Lengths& lengths, ContigPathMap::const_iterator path1It, ContigPathMap& paths, set<ContigID>& out, set<ContigID>& overlaps) { ostringstream vout; out.clear(); ContigID id(path1It->first); const ContigPath& path = path1It->second; if (gDebugPrint) vout << get(g_contigNames, ContigNode(id, false)) << '\t' << path << '\n'; for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { ContigNode pivot = *it; if (pivot.ambiguous() || pivot.id() == id) continue; ContigPathMap::iterator path2It = paths.find(pivot.contigIndex()); if (path2It == paths.end()) continue; ContigPath path2 = path2It->second; if (pivot.sense()) reverseComplement(path2.begin(), path2.end()); ContigPath consensus = align(lengths, path, path2, pivot); if (consensus.empty()) continue; if (equalIgnoreAmbiguos(consensus, path)) { if (gDebugPrint) vout << get(g_contigNames, pivot) << '\t' << path2 << '\n'; out.insert(path2It->first); } else if (equalIgnoreAmbiguos(consensus, path2)) { // This path is larger. Use it as the seed. return identifySubsumedPaths(lengths, path2It, paths, out, overlaps); } else if (isCycle(lengths, consensus)) { // The consensus path is a cycle. bool isCyclePath1 = isCycle(lengths, path); bool isCyclePath2 = isCycle(lengths, path2); if (!isCyclePath1 && !isCyclePath2) { // Neither path is a cycle. if (gDebugPrint) vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << "ignored\t" << consensus << '\n'; overlaps.insert(id); overlaps.insert(path2It->first); } else { // At least one path is a cycle. if (gDebugPrint) vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << "cycle\t" << consensus << '\n'; if (isCyclePath1 && isCyclePath2) out.insert(path2It->first); else if (!isCyclePath1) overlaps.insert(id); else if (!isCyclePath2) overlaps.insert(path2It->first); } } else { if (gDebugPrint) vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << "ignored\t" << consensus << '\n'; overlaps.insert(id); overlaps.insert(path2It->first); } } cout << vout.str(); return id; }
/** Find an equivalent region of the two specified paths. * @param[out] orientation the orientation of the alignment * @return the consensus sequence */ static ContigPath align(const Lengths& lengths, const ContigPath& path1, const ContigPath& path2, ContigNode pivot, dir_type& orientation) { if (&path1 == &path2) { // Ignore the trivial alignment when aligning a path to // itself. } else if (path1 == path2) { // These two paths are identical. orientation = DIR_B; return path1; } else { ContigPath::const_iterator it = search(path1.begin(), path1.end(), path2.begin(), path2.end()); if (it != path1.end()) { // path2 is subsumed in path1. // Determine the orientation of the edge. orientation = it == path1.begin() ? DIR_R : it + path2.size() == path1.end() ? DIR_F : DIR_B; return path1; } } // Find a suitable pivot. if (find(path1.begin(), path1.end(), pivot) == path1.end() || find(path2.begin(), path2.end(), pivot) == path2.end()) { bool good; tie(pivot, good) = findPivot(path1, path2); if (!good) return ContigPath(); } assert(find(path1.begin(), path1.end(), pivot) != path1.end()); ContigPath::const_iterator it2 = find(path2.begin(), path2.end(), pivot); assert(it2 != path2.end()); if (&path1 != &path2) { // The seed must be unique in path2, unless we're aligning a // path to itself. assert(count(it2+1, path2.end(), pivot) == 0); } ContigPath consensus; for (ContigPath::const_iterator it1 = find_if( path1.begin(), path1.end(), bind2nd(equal_to<ContigNode>(), pivot)); it1 != path1.end(); it1 = find_if(it1+1, path1.end(), bind2nd(equal_to<ContigNode>(), pivot))) { if (&*it1 == &*it2) { // We are aligning a path to itself, and this is the // trivial alignment, which we'll ignore. continue; } consensus = align(lengths, path1, path2, it1, it2, orientation); if (!consensus.empty()) return consensus; } return consensus; }
/** Return the consensus sequence of the specified gap. */ static ContigPath fillGap(const Graph& g, const AmbPathConstraint& apConstraint, vector<bool>& seen, ofstream& outFasta) { if (opt::verbose > 1) cerr << "\n* " << get(vertex_name, g, apConstraint.source) << ' ' << apConstraint.dist << "N " << get(vertex_name, g, apConstraint.dest) << '\n'; Constraints constraints; constraints.push_back(Constraint(apConstraint.dest, apConstraint.dist + opt::distanceError)); ContigPaths solutions; unsigned numVisited = 0; constrainedSearch(g, apConstraint.source, constraints, solutions, numVisited); bool tooComplex = numVisited >= opt::maxCost; for (ContigPaths::iterator solIt = solutions.begin(); solIt != solutions.end(); solIt++) solIt->insert(solIt->begin(), apConstraint.source); ContigPath consensus; bool tooManySolutions = solutions.size() > opt::numBranches; if (tooComplex) { stats.tooComplex++; if (opt::verbose > 1) cerr << solutions.size() << " paths (too complex)\n"; } else if (tooManySolutions) { stats.numTooManySolutions++; if (opt::verbose > 1) cerr << solutions.size() << " paths (too many)\n"; } else if (solutions.empty()) { stats.numNoSolutions++; if (opt::verbose > 1) cerr << "no paths\n"; } else if (solutions.size() == 1) { if (opt::verbose > 1) cerr << "1 path\n" << solutions.front() << '\n'; stats.numMerged++; } else { assert(solutions.size() > 1); if (opt::verbose > 2) copy(solutions.begin(), solutions.end(), ostream_iterator<ContigPath>(cerr, "\n")); else if (opt::verbose > 1) cerr << solutions.size() << " paths\n"; consensus = align(g, solutions, outFasta); if (!consensus.empty()) { stats.numMerged++; // Mark contigs that are used in a consensus. markSeen(seen, solutions, true); if (opt::verbose > 1) cerr << consensus << '\n'; } else stats.notMerged++; } return consensus; }