/**
 * Add distances to a path.
 *
 * Walks the path pairwise. Wherever getDistance() reports a
 * non-negative distance between two consecutive nodes, an ambiguous
 * 'N' node of length (distance + k - 1), but never less than 1, is
 * inserted between them. Consecutive nodes with a negative distance
 * are left adjacent.
 *
 * @param g the graph handed to getDistance()
 * @param path the path to annotate; may be empty
 * @return a copy of path with the gap nodes inserted, or an empty
 *         path when path is empty
 */
static ContigPath addDistance(const Graph& g, const ContigPath& path)
{
	ContigPath out;
	// An empty path has neither a front() nor any adjacent pair;
	// touching either would be undefined behaviour.
	if (path.empty())
		return out;

	out.reserve(path.size());
	ContigNode u = path.front();
	out.push_back(u);
	for (ContigPath::const_iterator it = path.begin() + 1;
			it != path.end(); ++it) {
		ContigNode v = *it;
		int distance = getDistance(g, u, v);
		if (distance >= 0) {
			int numN = distance + opt::k - 1; // by convention
			assert(numN >= 0);
			// Never emit a zero-length gap node.
			numN = max(numN, 1);
			out.push_back(ContigNode(numN, 'N'));
		}
		out.push_back(v);
		u = v;
	}
	return out;
}
static void* worker(void* pArg) { WorkerArg& arg = *static_cast<WorkerArg*>(pArg); for (;;) { /** Lock the input stream. */ static pthread_mutex_t inMutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&inMutex); EstimateRecord er; bool good = (*arg.in) >> er; pthread_mutex_unlock(&inMutex); if (!good) break; // Flip the anterior distance estimates. for (Estimates::iterator it = er.estimates[1].begin(); it != er.estimates[1].end(); ++it) it->first ^= 1; ContigPath path; handleEstimate(*arg.graph, er, true, path); reverseComplement(path.begin(), path.end()); path.push_back(ContigNode(er.refID, false)); handleEstimate(*arg.graph, er, false, path); if (path.size() > 1) { /** Lock the output stream. */ static pthread_mutex_t outMutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&outMutex); *arg.out << get(g_contigNames, er.refID) << '\t' << path << '\n'; assert(arg.out->good()); pthread_mutex_unlock(&outMutex); } } return NULL; }
/**
 * Return an ambiguous path that agrees with all the given paths.
 *
 * The result is the prefix common to every path, followed (only when
 * a common suffix exists) by a single 'N' gap node and that common
 * suffix. The suffix search is limited to the positions of the
 * shortest path that the prefix did not already consume. When no
 * common suffix is found the result is the common prefix alone.
 *
 * @param g the graph used for length and distance queries
 * @param origin the node the paths start from; used when measuring
 *               path length and as the node before the suffix when
 *               the suffix spans the whole longest path
 * @param paths the candidate paths; must not be empty
 * @return the merged, ambiguous path
 */
static ContigPath constructAmbiguousPath(const Graph &g,
		const ContigNode& origin, const ContigPaths& paths)
{
	assert(!paths.empty());

	// Every comparison is made against the first path.
	const ContigPath& reference = paths.front();

	// The shortest path bounds how far prefix and suffix can reach.
	size_t shortest = reference.size();
	for (ContigPaths::const_iterator p = paths.begin() + 1;
			p != paths.end(); ++p)
		shortest = min(shortest, p->size());

	// Count the leading positions at which all paths agree.
	size_t longestPrefix = 0;
	while (longestPrefix < shortest) {
		const ContigNode& expected = reference[longestPrefix];
		bool agree = true;
		for (ContigPaths::const_iterator p = paths.begin();
				agree && p != paths.end(); ++p)
			agree = !((*p)[longestPrefix] != expected);
		if (!agree)
			break;
		++longestPrefix;
	}

	// Count the trailing positions at which all paths agree, without
	// overlapping the prefix on the shortest path.
	const size_t suffixLimit = shortest - longestPrefix;
	size_t longestSuffix = 0;
	while (longestSuffix < suffixLimit) {
		const ContigNode& expected
			= reference[reference.size() - longestSuffix - 1];
		bool agree = true;
		for (ContigPaths::const_iterator p = paths.begin();
				agree && p != paths.end(); ++p)
			agree = !((*p)[p->size() - longestSuffix - 1]
					!= expected);
		if (!agree)
			break;
		++longestSuffix;
	}

	ContigPath out;
	out.reserve(longestPrefix + 1 + longestSuffix);
	out.insert(out.end(),
			reference.begin(), reference.begin() + longestPrefix);

	// Without a common suffix there is nothing to bridge to.
	if (longestSuffix == 0)
		return out;

	const ContigPath& longestPath = *max_element(
			paths.begin(), paths.end(), ComparePathLength(g, origin));
	unsigned length = calculatePathLength(g, origin, longestPath,
			longestPrefix, longestSuffix);

	// Account for the overlap on the right: the distance between the
	// node just before the suffix (or the origin, when the suffix
	// covers the whole longest path) and the first suffix node.
	int dist = length + getDistance(g,
			longestSuffix == longestPath.size() ? origin
				: *(longestPath.rbegin() + longestSuffix),
			*(longestPath.rbegin() + longestSuffix - 1));

	// Add k-1 because it is the convention.
	int numN = dist + opt::k - 1;
	assert(numN > 0);
	out.push_back(ContigNode(numN, 'N'));

	// The common suffix, in path order, taken from the first path.
	out.insert(out.end(),
			reference.end() - longestSuffix, reference.end());
	return out;
}
/*
 * Resolve an ambiguous region using pairwise alignment
 * (Needleman-Wunsch).
 *
 * 'solutions' must contain exactly two paths, each longer than one
 * node, that share the same first (source) and last (dest) node.
 * Only the interior of each path is compared.
 *
 * Outcomes:
 *  - One interior is empty: the other interior is treated as a
 *    possible deletion. Its merged sequence is lower-cased from
 *    offset k-1 onward and emitted as a new contig if the identity
 *    (k-1) / length reaches opt::identity.
 *  - The interiors merge to identical sequences: one of the input
 *    paths is returned unchanged (the first when the interiors have
 *    the same node count, otherwise the one with the longer
 *    interior).
 *  - Otherwise the sequences are globally aligned, and the consensus
 *    is emitted as a new contig if both the length ratio and the
 *    alignment identity reach opt::identity.
 *
 * Returns source, new contig, dest when a contig is emitted, and an
 * empty path when the two solutions are judged too dissimilar.
 */
static ContigPath alignPair(const Graph& g,
		const ContigPaths& solutions, ofstream& out)
{
	assert(solutions.size() == 2);
	assert(solutions[0].size() > 1);
	assert(solutions[1].size() > 1);
	assert(solutions[0].front() == solutions[1].front());
	assert(solutions[0].back() == solutions[1].back());

	// Strip the shared endpoints from both solutions.
	ContigPath innerA(solutions[0].begin()+1, solutions[0].end()-1);
	ContigPath innerB(solutions[1].begin()+1, solutions[1].end()-1);

	if (innerA.empty() || innerB.empty()) {
		// This entire sequence may be deleted.
		const ContigPath& present(innerA.empty() ? innerB : innerA);
		assert(!present.empty());
		Sequence merged(mergePath(g, present));
		assert(merged.size() > opt::k - 1);

		// Lower-case everything from offset k-1 onward.
		string::iterator tailStart = merged.begin() + opt::k - 1;
		transform(tailStart, merged.end(), tailStart, ::tolower);

		unsigned matched = opt::k - 1;
		float identity = (float)matched / merged.size();
		if (opt::verbose > 2)
			cerr << merged << '\n';
		if (opt::verbose > 1)
			cerr << identity
				<< (identity < opt::identity ? " (too low)\n" : "\n");
		if (identity < opt::identity)
			return ContigPath();

		unsigned coverage
			= calculatePathProperties(g, present).coverage;
		ContigNode created = outputNewContig(g, solutions, 1, 1,
				merged, coverage, out);
		ContigPath result;
		result.push_back(solutions.front().front());
		result.push_back(created);
		result.push_back(solutions.front().back());
		return result;
	}

	Sequence seqA(mergePath(g, innerA));
	Sequence seqB(mergePath(g, innerB));

	if (seqA == seqB) {
		// These two paths have identical sequence.
		if (innerA.size() != innerB.size()) {
			// The paths are different lengths.
			cerr << PROGRAM ": warning: "
				"Two paths have identical sequence, which may be "
				"caused by a transitive edge in the overlap graph.\n"
				<< '\t' << innerA << '\n'
				<< '\t' << innerB << '\n';
			return solutions[innerA.size() > innerB.size() ? 0 : 1];
		}

		// A perfect match must be caused by palindrome: the paths
		// are expected to differ at exactly one position, where one
		// node is the complement of the other.
		typedef ContigPath::const_iterator It;
		pair<It, It> diff = mismatch(
				innerA.begin(), innerA.end(), innerB.begin());
		assert(diff.first != innerA.end());
		assert(diff.second != innerB.end());
		assert(*diff.first
				== get(vertex_complement, g, *diff.second));
		assert(equal(diff.first+1, It(innerA.end()), diff.second+1));
		if (opt::verbose > 1)
			cerr << "Palindrome: "
				<< get(vertex_contig_name, g, *diff.first) << '\n';
		return solutions[0];
	}

	// Reject pairs whose lengths differ too much to align usefully.
	unsigned minLength = min(seqA.length(), seqB.length());
	unsigned maxLength = max(seqA.length(), seqB.length());
	float lengthRatio = (float)minLength / maxLength;
	if (lengthRatio < opt::identity) {
		if (opt::verbose > 1)
			cerr << minLength << '\t' << maxLength
				<< '\t' << lengthRatio << "\t(different length)\n";
		return ContigPath();
	}

	// Globally align the two sequences and score the alignment.
	NWAlignment align;
	unsigned matched = alignGlobal(seqA, seqB, align);
	float identity = (float)matched / align.size();
	if (opt::verbose > 2)
		cerr << align;
	if (opt::verbose > 1)
		cerr << identity
			<< (identity < opt::identity ? " (too low)\n" : "\n");
	if (identity < opt::identity)
		return ContigPath();

	// The new contig carries the coverage of both alternatives.
	unsigned coverage = calculatePathProperties(g, innerA).coverage
		+ calculatePathProperties(g, innerB).coverage;
	ContigNode created = outputNewContig(g, solutions, 1, 1,
			align.consensus(), coverage, out);
	ContigPath result;
	result.push_back(solutions.front().front());
	result.push_back(created);
	result.push_back(solutions.front().back());
	return result;
}