/**
 * Read contig paths from the specified file.
 * Standard input is read instead when inPath is "-".
 * @param inPath the filename of the contig paths
 * @param ids [out] if not NULL, receives the string ID of each path
 *        in the order read; it must be empty on entry
 * @return the paths in the order they were read
 */
static ContigPaths readPaths(const string& inPath,
		vector<string>* ids = NULL)
{
	const bool useStdin = inPath == "-";
	if (ids != NULL)
		assert(ids->empty());

	// The file stream is constructed unconditionally, but it is only
	// checked and used when the input is not standard input.
	ifstream fin(inPath.c_str());
	if (opt::verbose > 0)
		cerr << "Reading `" << inPath << "'..." << endl;
	if (!useStdin)
		assert_good(fin, inPath);
	istream& in = useStdin ? cin : fin;

	ContigPaths result;
	unsigned numRead = 0;
	string name;
	ContigPath current;
	while (in >> name >> current) {
		result.push_back(current);
		if (ids != NULL)
			ids->push_back(name);

		// Report progress every million records at high verbosity.
		++numRead;
		const bool milestone = numRead % 1000000 == 0;
		if (opt::verbose > 1 && milestone)
			cerr << "Read " << numRead << " paths. Using "
				<< toSI(getMemoryUsage()) << "B of memory.\n";
	}

	if (opt::verbose > 0)
		cerr << "Read " << numRead << " paths. Using "
			<< toSI(getMemoryUsage()) << "B of memory.\n";

	// Extraction must have stopped at end of input, not at a bad record.
	assert(in.eof());
	return result;
}
/**
 * Mark every contig used by any path in paths as seen.
 * Nodes whose id() is not a valid index into seen are skipped.
 * @param seen [in,out] flags indexed by contig id
 * @param paths the paths to scan
 */
static void seenContigs(vector<bool>& seen, const ContigPaths& paths)
{
	for (size_t p = 0; p < paths.size(); ++p) {
		const ContigPath& path = paths[p];
		for (size_t n = 0; n < path.size(); ++n) {
			const ContigNode& node = path[n];
			if (!(node.id() < seen.size()))
				continue;
			seen[node.id()] = true;
		}
	}
}
/**
 * Mark contigs for removal. An empty path indicates that the contig
 * named by the corresponding entry of pathIDs should be removed.
 * @param marked [in,out] flags indexed by contig index
 * @param pathIDs the string ID of each path, parallel to paths
 * @param paths the contig paths
 */
static void markRemovedContigs(vector<bool>& marked,
		const vector<string>& pathIDs, const ContigPaths& paths)
{
	for (size_t idx = 0; idx < paths.size(); ++idx) {
		if (!paths[idx].empty())
			continue;

		// Translate the path's name into its contig index.
		size_t contig = get(g_contigNames, pathIDs[idx]);
		assert(contig < marked.size());
		marked[contig] = true;
	}
}
/**
 * Return the set of contigs that appear more than once in a single
 * solution. The seed is tallied once for every solution before that
 * solution's own nodes are counted, so a solution that contains the
 * seed reports the seed as a repeat.
 * @param seed the contig the solutions start from
 * @param solutions the candidate paths
 */
static set<ContigID> findRepeats(ContigID seed,
		const ContigPaths& solutions)
{
	typedef map<ContigID, unsigned> Tally;

	set<ContigID> repeated;
	for (size_t s = 0; s < solutions.size(); ++s) {
		const ContigPath& solution = solutions[s];

		// Occurrences are counted per solution, never across solutions.
		Tally occurrences;
		++occurrences[seed];
		for (size_t n = 0; n < solution.size(); ++n)
			++occurrences[solution[n].contigIndex()];

		for (Tally::const_iterator entry = occurrences.begin();
				entry != occurrences.end(); ++entry) {
			if (entry->second > 1)
				repeated.insert(entry->first);
		}
	}
	return repeated;
}
/**
 * Read contig paths from the specified file and record every
 * ambiguous gap found in them.
 * Standard input is read instead when inPath is "-".
 *
 * For each ambiguous node u with neighbours t and v, an entry keyed by
 * AmbPathConstraint(t, v, u.length()) with an empty ContigPath is
 * inserted into g_ambpath_contig.
 *
 * @param[in] inPath the filename of the contig paths
 * @param[out] ids the string ID of the paths; must be empty on entry
 * @param[out] isAmb whether the path contains a gap; must be empty on
 *             entry
 * @return the paths in the order they were read
 */
static ContigPaths readPaths(const string& inPath,
	vector<string>& ids, vector<bool>& isAmb)
{
	assert(ids.empty());
	assert(isAmb.empty());
	assert(g_ambpath_contig.empty());
	ifstream fin(inPath.c_str());
	if (opt::verbose > 0)
		cerr << "Reading `" << inPath << "'..." << endl;
	if (inPath != "-")
		assert_good(fin, inPath);
	istream& in = inPath == "-" ? cin : fin;

	ContigPaths paths;
	string id;
	Path path;
	while (in >> id >> path) {
		paths.push_back(path);
		ids.push_back(id);
		isAmb.push_back(false);

		// A gap needs a node on each side, so shorter paths have none.
		if (path.size() <= 2)
			continue;

		// Slide a window (t, u, v) of three consecutive nodes along the
		// path; u is the candidate gap.
		for (Path::iterator it = path.begin() + 2;
				it != path.end(); ++it) {
			ContigPath::value_type t = it[-2], u = it[-1], v = it[0];
			if (u.ambiguous()) {
				// Two gaps may not be adjacent.
				assert(!t.ambiguous());
				assert(!v.ambiguous());
				g_ambpath_contig.insert(AmbPath2Contig::value_type(
					AmbPathConstraint(t, v, u.length()),
					ContigPath()));
				isAmb.back() = true;
			}
		}
	}

	// Extraction must have stopped at end of input, not at a bad record.
	assert(in.eof());
	return paths;
}
/** Assemble the path overlap graph. */ static void assemblePathGraph(const Lengths& lengths, PathGraph& pathGraph, ContigPathMap& paths) { ContigPaths seedPaths; assembleDFS(pathGraph, back_inserter(seedPaths)); ContigPaths mergedPaths = mergeSeedPaths(lengths, paths, seedPaths); if (opt::verbose > 1) cout << '\n'; // Replace each path with the merged path. for (ContigPaths::const_iterator it1 = seedPaths.begin(); it1 != seedPaths.end(); ++it1) { const ContigPath& path(mergedPaths[it1 - seedPaths.begin()]); ContigPath pathrc(path); reverseComplement(pathrc.begin(), pathrc.end()); for (ContigPath::const_iterator it2 = it1->begin(); it2 != it1->end(); ++it2) { ContigNode seed(*it2); if (find(path.begin(), path.end(), seed) != path.end()) { paths[seed.contigIndex()] = seed.sense() ? pathrc : path; } else { // This seed was not included in the merged path. } } } removeRepeats(paths); // Remove the subsumed paths. if (opt::verbose > 0) cout << "Removing redundant contigs\n"; removeSubsumedPaths(lengths, paths); outputSortedPaths(paths); }
/**
 * Merge the specified seed paths.
 * @param lengths the contig lengths, passed on to mergePath
 * @param paths the path of each contig
 * @param seedPaths the seed paths to merge
 * @return the merged contig paths, one per seed path and in the same
 *         order
 */
static ContigPaths mergeSeedPaths(const Lengths& lengths,
		const ContigPathMap& paths, const ContigPaths& seedPaths)
{
	if (opt::verbose > 0)
		cout << "\nMerging paths\n";

	const size_t numSeeds = seedPaths.size();
	ContigPaths merged;
	merged.reserve(numSeeds);
	for (size_t i = 0; i < numSeeds; ++i)
		merged.push_back(mergePath(lengths, paths, seedPaths[i]));
	return merged;
}
/**
 * Output the updated overlap graph to opt::graphPath.
 * A merged vertex named after the path ID is added for every
 * non-empty path. The vertices those paths replace are then removed
 * and the graph is written out.
 * @param g [in,out] the overlap graph
 * @param pathIDs the string ID of each path, parallel to paths
 * @param paths the contig paths
 * @param commandLine the command line recorded in the output
 */
static void outputGraph(Graph& g,
		const vector<string>& pathIDs, const ContigPaths& paths,
		const string& commandLine)
{
	typedef graph_traits<Graph>::vertex_descriptor V;
	const size_t numPaths = paths.size();

	// Add the path vertices. The name table is unlocked only while
	// the new names are being registered.
	g_contigNames.unlock();
	for (size_t i = 0; i < numPaths; ++i) {
		const ContigPath& path = paths[i];
		const string& id = pathIDs[i];
		if (path.empty())
			continue;
		V merged = merge(g, path.begin(), path.end());
		put(vertex_name, g, merged, id);
	}
	g_contigNames.lock();

	// Remove the vertices that are used in paths.
	for (size_t i = 0; i < numPaths; ++i) {
		const ContigPath& path = paths[i];
		const string& id = pathIDs[i];
		if (!path.empty()) {
			// The predicate selects the nodes that are not ambiguous.
			remove_vertex_if(g, path.begin(), path.end(),
					not1(std::mem_fun_ref(&ContigNode::ambiguous)));
		} else {
			// An empty path marks the contig itself for removal.
			remove_vertex(find_vertex(id, false, g), g);
		}
	}

	// Output the graph.
	const string& destination = opt::graphPath;
	assert(!destination.empty());
	if (opt::verbose > 0)
		cerr << "Writing `" << destination << "'..." << endl;
	ofstream fout(destination.c_str());
	assert_good(fout, destination);
	write_graph(fout, g, PROGRAM, commandLine);
	// Check the stream again to catch errors raised while writing.
	assert_good(fout, destination);

	if (opt::verbose > 0)
		printGraphStats(cerr, g);
}
/** Find a path for the specified distance estimates. * @param out [out] the solution path */ static void handleEstimate(const Graph& g, const EstimateRecord& er, bool dirIdx, ContigPath& out) { if (er.estimates[dirIdx].empty()) return; ContigNode origin(er.refID, dirIdx); ostringstream vout_ss; ostream bitBucket(NULL); ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket; vout << "\n* " << get(vertex_name, g, origin) << '\n'; unsigned minNumPairs = UINT_MAX; // generate the reachable set Constraints constraints; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; minNumPairs = min(minNumPairs, ep.numPairs); constraints.push_back(Constraint(v, ep.distance + allowedError(ep.stdDev))); } vout << "Constraints:"; printConstraints(vout, g, constraints) << '\n'; ContigPaths solutions; unsigned numVisited = 0; constrainedSearch(g, origin, constraints, solutions, numVisited); bool tooComplex = numVisited >= opt::maxCost; bool tooManySolutions = solutions.size() > opt::maxPaths; set<ContigID> repeats = findRepeats(er.refID, solutions); if (!repeats.empty()) { vout << "Repeats:"; for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it) vout << ' ' << get(g_contigNames, *it); vout << '\n'; } unsigned numPossiblePaths = solutions.size(); if (numPossiblePaths > 0) vout << "Paths: " << numPossiblePaths << '\n'; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end();) { vout << *solIter << '\n'; // Calculate the path distance to each node and see if // it is within the estimated distance. map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); // Remove solutions whose distance estimates are not correct. 
unsigned validCount = 0, invalidCount = 0, ignoredCount = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; vout << get(vertex_name, g, v) << ',' << ep << '\t'; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); if (dmIter == distanceMap.end()) { // This contig is a repeat. ignoredCount++; vout << "ignored\n"; continue; } // translate distance by -overlap to match // coordinate space used by the estimate int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; unsigned buffer = allowedError(ep.stdDev); bool invalid = (unsigned)abs(diff) > buffer; bool repeat = repeats.count(v.contigIndex()) > 0; bool ignored = invalid && repeat; if (ignored) ignoredCount++; else if (invalid) invalidCount++; else validCount++; vout << "dist: " << actualDistance << " diff: " << diff << " buffer: " << buffer << " n: " << ep.numPairs << (ignored ? " ignored" : invalid ? 
" invalid" : "") << '\n'; } if (invalidCount == 0 && validCount > 0) ++solIter; else solIter = solutions.erase(solIter); } vout << "Solutions: " << solutions.size(); if (tooComplex) vout << " (too complex)"; if (tooManySolutions) vout << " (too many solutions)"; vout << '\n'; ContigPaths::iterator bestSol = solutions.end(); int minDiff = 999999; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end(); ++solIter) { map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); int sumDiff = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; if (repeats.count(v.contigIndex()) > 0) continue; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); assert(dmIter != distanceMap.end()); int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; sumDiff += abs(diff); } if (sumDiff < minDiff) { minDiff = sumDiff; bestSol = solIter; } vout << *solIter << " length: " << calculatePathLength(g, origin, *solIter) << " sumdiff: " << sumDiff << '\n'; } /** Lock the debugging stream. 
*/ static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&coutMutex); stats.totalAttempted++; g_minNumPairs = min(g_minNumPairs, minNumPairs); if (tooComplex) { stats.tooComplex++; } else if (tooManySolutions) { stats.tooManySolutions++; } else if (numPossiblePaths == 0) { stats.noPossiblePaths++; } else if (solutions.empty()) { stats.noValidPaths++; } else if (repeats.count(er.refID) > 0) { vout << "Repeat: " << get(vertex_name, g, origin) << '\n'; stats.repeat++; } else if (solutions.size() > 1) { ContigPath path = constructAmbiguousPath(g, origin, solutions); if (!path.empty()) { if (opt::extend) extend(g, path.back(), back_inserter(path)); vout << path << '\n'; if (opt::scaffold) { out.insert(out.end(), path.begin(), path.end()); g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } } stats.multiEnd++; } else { assert(solutions.size() == 1); assert(bestSol != solutions.end()); ContigPath& path = *bestSol; if (opt::verbose > 1) printDistanceMap(vout, g, origin, path); if (opt::extend) extend(g, path.back(), back_inserter(path)); out.insert(out.end(), path.begin(), path.end()); stats.uniqueEnd++; g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } cout << vout_ss.str(); if (!out.empty()) assert(!out.back().ambiguous()); pthread_mutex_unlock(&coutMutex); }
/**
 * Return an ambiguous path that agrees with all the given paths.
 *
 * The result is the longest prefix shared by every path. When a
 * non-empty common suffix also exists, the prefix is followed by an
 * 'N' node sized from the longest path and then by that suffix. With
 * no common suffix only the prefix is returned.
 *
 * @param g the contig graph
 * @param origin the node from which every path starts
 * @param paths the candidate paths; must not be empty
 */
static ContigPath constructAmbiguousPath(const Graph &g,
		const ContigNode& origin, const ContigPaths& paths)
{
	assert(!paths.empty());

	// The shortest path bounds how far the prefix and suffix can reach.
	const ContigPath& reference = paths.front();
	size_t shortest = reference.size();
	for (size_t i = 1; i < paths.size(); ++i)
		shortest = min(shortest, paths[i].size());

	// Find the longest prefix.
	ContigPath head;
	size_t longestPrefix = 0;
	while (longestPrefix < shortest) {
		const ContigNode& candidate = reference[longestPrefix];
		bool shared = true;
		for (size_t i = 0; shared && i < paths.size(); ++i)
			shared = !(paths[i][longestPrefix] != candidate);
		if (!shared)
			break;
		head.push_back(candidate);
		++longestPrefix;
	}

	// Find the longest suffix. It may not overlap the prefix within
	// the shortest path. Nodes are collected back to front.
	ContigPath tailReversed;
	size_t longestSuffix = 0;
	const size_t suffixLimit = shortest - longestPrefix;
	while (longestSuffix < suffixLimit) {
		const ContigNode& candidate
			= reference[reference.size() - longestSuffix - 1];
		bool shared = true;
		for (size_t i = 0; shared && i < paths.size(); ++i) {
			const ContigPath& other = paths[i];
			shared = !(other[other.size() - longestSuffix - 1]
					!= candidate);
		}
		if (!shared)
			break;
		tailReversed.push_back(candidate);
		++longestSuffix;
	}

	ContigPath out;
	out.reserve(head.size() + 1 + tailReversed.size());
	out.insert(out.end(), head.begin(), head.end());
	if (longestSuffix == 0)
		return out;

	// Size the gap from the longest of the candidate paths.
	const ContigPath& longestPath(
			*max_element(paths.begin(), paths.end(),
				ComparePathLength(g, origin)));
	unsigned length = calculatePathLength(g, origin, longestPath,
			longestPrefix, longestSuffix);

	// Account for the overlap on the right. The node just before the
	// suffix is the origin when the suffix covers the whole path.
	const size_t suffixStart = longestPath.size() - longestSuffix;
	const ContigNode& beforeSuffix = suffixStart == 0
		? origin : longestPath[suffixStart - 1];
	const ContigNode& firstOfSuffix = longestPath[suffixStart];
	int dist = length + getDistance(g, beforeSuffix, firstOfSuffix);

	// Add k-1 because it is the convention.
	int numN = dist + opt::k - 1;
	assert(numN > 0);

	out.push_back(ContigNode(numN, 'N'));
	out.insert(out.end(), tailReversed.rbegin(), tailReversed.rend());
	return out;
}
/**
 * Return the consensus sequence of the specified gap.
 *
 * All paths from apConstraint.source to apConstraint.dest within
 * apConstraint.dist + opt::distanceError are searched for. A consensus
 * is built with align() only when two or more paths are found and the
 * search was neither too complex nor too branched. An empty path is
 * returned in every other case, including a single solution.
 *
 * @param g the contig graph
 * @param apConstraint the source, destination and size of the gap
 * @param seen [in,out] passed to markSeen together with the solutions
 *        when a consensus is produced
 * @param outFasta the stream passed on to align()
 */
static ContigPath fillGap(const Graph& g,
		const AmbPathConstraint& apConstraint,
		vector<bool>& seen,
		ofstream& outFasta)
{
	const bool chatty = opt::verbose > 1;
	if (chatty)
		cerr << "\n* "
			<< get(vertex_name, g, apConstraint.source) << ' '
			<< apConstraint.dist << "N "
			<< get(vertex_name, g, apConstraint.dest) << '\n';

	Constraints limits;
	limits.push_back(Constraint(apConstraint.dest,
				apConstraint.dist + opt::distanceError));

	ContigPaths candidates;
	unsigned visited = 0;
	constrainedSearch(g, apConstraint.source,
			limits, candidates, visited);

	// Prepend the source to each candidate path.
	for (size_t i = 0; i < candidates.size(); ++i)
		candidates[i].insert(candidates[i].begin(),
				apConstraint.source);

	if (visited >= opt::maxCost) {
		stats.tooComplex++;
		if (chatty)
			cerr << candidates.size() << " paths (too complex)\n";
		return ContigPath();
	}

	if (candidates.size() > opt::numBranches) {
		stats.numTooManySolutions++;
		if (chatty)
			cerr << candidates.size() << " paths (too many)\n";
		return ContigPath();
	}

	if (candidates.empty()) {
		stats.numNoSolutions++;
		if (chatty)
			cerr << "no paths\n";
		return ContigPath();
	}

	if (candidates.size() == 1) {
		// A unique path is counted as merged; no consensus is built.
		if (chatty)
			cerr << "1 path\n" << candidates.front() << '\n';
		stats.numMerged++;
		return ContigPath();
	}

	assert(candidates.size() > 1);
	if (opt::verbose > 2)
		copy(candidates.begin(), candidates.end(),
				ostream_iterator<ContigPath>(cerr, "\n"));
	else if (chatty)
		cerr << candidates.size() << " paths\n";

	ContigPath consensus = align(g, candidates, outFasta);
	if (consensus.empty()) {
		stats.notMerged++;
		return consensus;
	}

	stats.numMerged++;
	// Mark contigs that are used in a consensus.
	markSeen(seen, candidates, true);
	if (chatty)
		cerr << consensus << '\n';
	return consensus;
}
/**
 * Resolve an ambiguous region using pairwise alignment
 * (Needleman-Wunsch).
 *
 * 'solutions' must contain exactly two paths that share the same
 * first (source) and last (dest) node. Only the nodes between those
 * end points are compared.
 *
 * @param g the contig graph
 * @param solutions the two alternative paths
 * @param out the stream passed on to outputNewContig
 * @return source, a newly output consensus contig and dest when the
 *         alternatives are similar enough; one of the two input paths
 *         when their merged sequences are identical; otherwise an
 *         empty path
 */
static ContigPath alignPair(const Graph& g,
		const ContigPaths& solutions, ofstream& out)
{
	assert(solutions.size() == 2);
	const ContigPath& pathA = solutions[0];
	const ContigPath& pathB = solutions[1];
	assert(pathA.size() > 1);
	assert(pathB.size() > 1);
	assert(pathA.front() == pathB.front());
	assert(pathA.back() == pathB.back());

	// Strip the shared source and dest nodes.
	ContigPath innerA(pathA.begin() + 1, pathA.end() - 1);
	ContigPath innerB(pathB.begin() + 1, pathB.end() - 1);

	if (innerA.empty() || innerB.empty()) {
		// This entire sequence may be deleted.
		const ContigPath& present(innerA.empty() ? innerB : innerA);
		assert(!present.empty());
		Sequence consensus(mergePath(g, present));
		assert(consensus.size() > opt::k - 1);

		// Everything after the first k-1 bases becomes lower case.
		string::iterator lowerFrom = consensus.begin() + opt::k - 1;
		transform(lowerFrom, consensus.end(), lowerFrom, ::tolower);

		// Only the first k-1 bases are counted as matching.
		unsigned match = opt::k - 1;
		float identity = static_cast<float>(match) / consensus.size();
		const bool tooLow = identity < opt::identity;
		if (opt::verbose > 2)
			cerr << consensus << '\n';
		if (opt::verbose > 1)
			cerr << identity << (tooLow ? " (too low)\n" : "\n");
		if (tooLow)
			return ContigPath();

		unsigned coverage
			= calculatePathProperties(g, present).coverage;
		ContigNode merged = outputNewContig(g,
				solutions, 1, 1, consensus, coverage, out);
		ContigPath result;
		result.push_back(pathA.front());
		result.push_back(merged);
		result.push_back(pathA.back());
		return result;
	}

	Sequence seqA(mergePath(g, innerA));
	Sequence seqB(mergePath(g, innerB));
	if (seqA == seqB) {
		// These two paths have identical sequence.
		if (innerA.size() != innerB.size()) {
			// The paths are different lengths.
			cerr << PROGRAM ": warning: "
				"Two paths have identical sequence, which may be "
				"caused by a transitive edge in the overlap graph.\n"
				<< '\t' << innerA << '\n'
				<< '\t' << innerB << '\n';
			// Prefer the path with more nodes.
			return innerA.size() > innerB.size() ? pathA : pathB;
		}

		// A perfect match must be caused by palindrome.
		typedef ContigPath::const_iterator It;
		pair<It, It> diverge = mismatch(
				innerA.begin(), innerA.end(), innerB.begin());
		assert(diverge.first != innerA.end());
		assert(diverge.second != innerB.end());
		assert(*diverge.first
				== get(vertex_complement, g, *diverge.second));
		assert(equal(diverge.first+1, It(innerA.end()),
					diverge.second+1));
		if (opt::verbose > 1)
			cerr << "Palindrome: "
				<< get(vertex_contig_name, g, *diverge.first) << '\n';
		return pathA;
	}

	// Sequences of very different length are rejected without being
	// aligned.
	unsigned shorter = min(seqA.length(), seqB.length());
	unsigned longer = max(seqA.length(), seqB.length());
	float lengthRatio = static_cast<float>(shorter) / longer;
	if (lengthRatio < opt::identity) {
		if (opt::verbose > 1)
			cerr << shorter << '\t' << longer
				<< '\t' << lengthRatio << "\t(different length)\n";
		return ContigPath();
	}

	NWAlignment alignment;
	unsigned match = alignGlobal(seqA, seqB, alignment);
	float identity = static_cast<float>(match) / alignment.size();
	const bool tooLow = identity < opt::identity;
	if (opt::verbose > 2)
		cerr << alignment;
	if (opt::verbose > 1)
		cerr << identity << (tooLow ? " (too low)\n" : "\n");
	if (tooLow)
		return ContigPath();

	// The consensus carries the coverage of both alternatives.
	unsigned coverage = calculatePathProperties(g, innerA).coverage
		+ calculatePathProperties(g, innerB).coverage;
	ContigNode merged = outputNewContig(g, solutions, 1, 1,
			alignment.consensus(), coverage, out);
	ContigPath result;
	result.push_back(pathA.front());
	result.push_back(merged);
	result.push_back(pathA.back());
	return result;
}