/**
 * Prune ambiguous edges from the path graph.
 *
 * An edge (u,v) is removed when all of the following hold:
 *  - u has at least two out-edges,
 *  - v has at least two in-edges,
 *  - the last contig of the path of u equals the first contig of the
 *    path of v, and
 *  - the index of that contig is a key of @p paths.
 *
 * @param g     the path graph to prune, modified in place
 * @param paths the map used to look up the path of each vertex
 */
static void removeSmallOverlaps(PathGraph& g,
        const ContigPathMap& paths)
{
    typedef graph_traits<PathGraph>::edge_descriptor Edge;
    typedef graph_traits<PathGraph>::out_edge_iterator OutEdgeIt;
    typedef graph_traits<PathGraph>::vertex_descriptor Vertex;
    typedef graph_traits<PathGraph>::vertex_iterator VertexIt;

    // Candidate edges are only recorded during the scan and are
    // removed in a single batch once the scan has finished.
    vector<Edge> doomed;

    for (pair<VertexIt, VertexIt> vs = vertices(g);
            vs.first != vs.second; ++vs.first) {
        Vertex src = *vs.first;
        if (out_degree(src, g) <= 1)
            continue;

        ContigPath srcPath = getPath(paths, src);
        for (pair<OutEdgeIt, OutEdgeIt> es = out_edges(src, g);
                es.first != es.second; ++es.first) {
            Edge candidate = *es.first;
            Vertex dst = target(candidate, g);
            assert(dst != src);
            if (in_degree(dst, g) <= 1)
                continue;

            ContigPath dstPath = getPath(paths, dst);
            bool sharedEnd = srcPath.back() == dstPath.front();
            if (sharedEnd
                    && paths.count(srcPath.back().contigIndex()) > 0)
                doomed.push_back(candidate);
        }
    }

    remove_edges(g, doomed.begin(), doomed.end());

    if (opt::verbose > 0)
        cout << "Removed " << doomed.size()
            << " small overlap edges.\n";
    if (!opt::db.empty())
        addToDb(db, "Edges_removed_small_overlap", doomed.size());
}
/**
 * Trim an ambiguous contig from each end of the path.
 * The last element is examined first, then the first element; at
 * most one element is removed from each end.
 * @param path the path to trim, modified in place
 */
static void removeAmbiguousContigs(ContigPath& path)
{
    if (path.empty())
        return;
    if (path.back().ambiguous())
        path.erase(path.end() - 1);

    // Removing the tail may have emptied the path.
    if (path.empty())
        return;
    if (path.front().ambiguous())
        path.erase(path.begin());
}
/**
 * Write a FASTA comment describing the specified path to a stream.
 *
 * The comment is a comma-separated list: the name of the first
 * vertex, then either the name of the middle vertex (when the path
 * has exactly three elements) or "..." (when it has more than
 * three), then the name of the last vertex.
 *
 * @param out  the stream written to
 * @param g    the graph whose vertex_name property supplies the names
 * @param path the path to describe; asserted to have more than one
 *             element
 */
static void pathToComment(ostream& out,
        const Graph& g, const ContigPath& path)
{
    assert(path.size() > 1);

    out << get(vertex_name, g, path.front());

    if (path.size() > 3) {
        // Long paths are abbreviated.
        out << ",...";
    } else if (path.size() == 3) {
        out << ',' << get(vertex_name, g, path[1]);
    }

    out << ',' << get(vertex_name, g, path.back());
}
/** Find a path for the specified distance estimates. * @param out [out] the solution path */ static void handleEstimate(const Graph& g, const EstimateRecord& er, bool dirIdx, ContigPath& out) { if (er.estimates[dirIdx].empty()) return; ContigNode origin(er.refID, dirIdx); ostringstream vout_ss; ostream bitBucket(NULL); ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket; vout << "\n* " << get(vertex_name, g, origin) << '\n'; unsigned minNumPairs = UINT_MAX; // generate the reachable set Constraints constraints; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; minNumPairs = min(minNumPairs, ep.numPairs); constraints.push_back(Constraint(v, ep.distance + allowedError(ep.stdDev))); } vout << "Constraints:"; printConstraints(vout, g, constraints) << '\n'; ContigPaths solutions; unsigned numVisited = 0; constrainedSearch(g, origin, constraints, solutions, numVisited); bool tooComplex = numVisited >= opt::maxCost; bool tooManySolutions = solutions.size() > opt::maxPaths; set<ContigID> repeats = findRepeats(er.refID, solutions); if (!repeats.empty()) { vout << "Repeats:"; for (set<ContigID>::const_iterator it = repeats.begin(); it != repeats.end(); ++it) vout << ' ' << get(g_contigNames, *it); vout << '\n'; } unsigned numPossiblePaths = solutions.size(); if (numPossiblePaths > 0) vout << "Paths: " << numPossiblePaths << '\n'; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end();) { vout << *solIter << '\n'; // Calculate the path distance to each node and see if // it is within the estimated distance. map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); // Remove solutions whose distance estimates are not correct. 
unsigned validCount = 0, invalidCount = 0, ignoredCount = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; vout << get(vertex_name, g, v) << ',' << ep << '\t'; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); if (dmIter == distanceMap.end()) { // This contig is a repeat. ignoredCount++; vout << "ignored\n"; continue; } // translate distance by -overlap to match // coordinate space used by the estimate int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; unsigned buffer = allowedError(ep.stdDev); bool invalid = (unsigned)abs(diff) > buffer; bool repeat = repeats.count(v.contigIndex()) > 0; bool ignored = invalid && repeat; if (ignored) ignoredCount++; else if (invalid) invalidCount++; else validCount++; vout << "dist: " << actualDistance << " diff: " << diff << " buffer: " << buffer << " n: " << ep.numPairs << (ignored ? " ignored" : invalid ? 
" invalid" : "") << '\n'; } if (invalidCount == 0 && validCount > 0) ++solIter; else solIter = solutions.erase(solIter); } vout << "Solutions: " << solutions.size(); if (tooComplex) vout << " (too complex)"; if (tooManySolutions) vout << " (too many solutions)"; vout << '\n'; ContigPaths::iterator bestSol = solutions.end(); int minDiff = 999999; for (ContigPaths::iterator solIter = solutions.begin(); solIter != solutions.end(); ++solIter) { map<ContigNode, int> distanceMap = makeDistanceMap(g, origin, *solIter); int sumDiff = 0; for (Estimates::const_iterator iter = er.estimates[dirIdx].begin(); iter != er.estimates[dirIdx].end(); ++iter) { ContigNode v = iter->first; const DistanceEst& ep = iter->second; if (repeats.count(v.contigIndex()) > 0) continue; map<ContigNode, int>::iterator dmIter = distanceMap.find(v); assert(dmIter != distanceMap.end()); int actualDistance = dmIter->second; int diff = actualDistance - ep.distance; sumDiff += abs(diff); } if (sumDiff < minDiff) { minDiff = sumDiff; bestSol = solIter; } vout << *solIter << " length: " << calculatePathLength(g, origin, *solIter) << " sumdiff: " << sumDiff << '\n'; } /** Lock the debugging stream. 
*/ static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&coutMutex); stats.totalAttempted++; g_minNumPairs = min(g_minNumPairs, minNumPairs); if (tooComplex) { stats.tooComplex++; } else if (tooManySolutions) { stats.tooManySolutions++; } else if (numPossiblePaths == 0) { stats.noPossiblePaths++; } else if (solutions.empty()) { stats.noValidPaths++; } else if (repeats.count(er.refID) > 0) { vout << "Repeat: " << get(vertex_name, g, origin) << '\n'; stats.repeat++; } else if (solutions.size() > 1) { ContigPath path = constructAmbiguousPath(g, origin, solutions); if (!path.empty()) { if (opt::extend) extend(g, path.back(), back_inserter(path)); vout << path << '\n'; if (opt::scaffold) { out.insert(out.end(), path.begin(), path.end()); g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } } stats.multiEnd++; } else { assert(solutions.size() == 1); assert(bestSol != solutions.end()); ContigPath& path = *bestSol; if (opt::verbose > 1) printDistanceMap(vout, g, origin, path); if (opt::extend) extend(g, path.back(), back_inserter(path)); out.insert(out.end(), path.begin(), path.end()); stats.uniqueEnd++; g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs); } cout << vout_ss.str(); if (!out.empty()) assert(!out.back().ambiguous()); pthread_mutex_unlock(&coutMutex); }