/** Attempt to merge the paths specified in mergeQ with path.
 * @return the number of paths merged
 */
static unsigned mergePaths(const Lengths& lengths,
		ContigPath& path,
		deque<ContigNode>& mergeQ, set<ContigNode>& seen,
		const ContigPathMap& paths)
{
	unsigned merged = 0;
	deque<ContigNode> invalid;
	for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) {
		pivot = mergeQ.front();
		ContigPathMap::const_iterator path2It
			= paths.find(pivot.contigIndex());
		if (path2It == paths.end())
			continue;

		ContigPath path2 = path2It->second;
		if (pivot.sense())
			reverseComplement(path2.begin(), path2.end());
		ContigPath consensus = align(lengths, path, path2, pivot);
		if (consensus.empty()) {
			invalid.push_back(pivot);
			continue;
		}

		appendToMergeQ(mergeQ, seen, path2);
		path.swap(consensus);
		if (gDebugPrint)
#pragma omp critical(cout)
			cout << get(g_contigNames, pivot)
				<< '\t' << path2 << '\n'
				<< '\t' << path << '\n';
		merged++;
	}
	mergeQ.swap(invalid);
	return merged;
}
Beispiel #2
0
/** Return the properties of vertex u of graph g. An ambiguous
 * vertex is not looked up in the graph; its properties are built
 * from the length of its ambiguous sequence plus opt::k - 1, with a
 * second constructor argument of zero.
 */
static inline
ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u)
{
	if (u.ambiguous())
		return ContigProperties(u.length() + opt::k - 1, 0);
	return g[u];
}
Beispiel #3
0
/** Return the path denoted by u. A vertex that is not a path
 * yields a path holding just that vertex. Otherwise the stored path
 * is returned, reverse complemented when u is on the reverse
 * strand.
 */
static ContigPath getPath(const Paths& paths, const ContigNode& u)
{
	if (!isPath(u))
		return ContigPath(1, u);

	// Path vertices are numbered from Vertex::s_offset upwards.
	unsigned pathIndex = u.id() - Vertex::s_offset;
	if (u.sense())
		return reverseComplement(paths[pathIndex]);
	return paths[pathIndex];
}
/** Return a copy of the path stored for the contig of u, reverse
 * complemented when u is on the reverse strand. The contig must have
 * an entry in paths; this is only checked by an assertion.
 */
static ContigPath getPath(const ContigPathMap& paths, ContigNode u)
{
	ContigPathMap::const_iterator found
		= paths.find(u.contigIndex());
	assert(found != paths.end());
	ContigPath oriented = found->second;
	if (u.sense())
		reverseComplement(oriented.begin(), oriented.end());
	return oriented;
}
Beispiel #5
0
/** Return the sequence of the contig node id.
 * A regular node yields the stored contig sequence, reverse
 * complemented when the node is on the reverse strand. An ambiguous
 * node yields opt::k - 1 'N' characters followed by its ambiguous
 * sequence, which is lower-cased when it is shorter than opt::k.
 */
static Sequence sequence(const Contigs& contigs, const ContigNode& id)
{
	if (!id.ambiguous()) {
		const Sequence& seq = contigs[id.id()].seq;
		if (id.sense())
			return reverseComplement(seq);
		return seq;
	}

	string gap(id.ambiguousSequence());
	if (gap.length() < opt::k)
		transform(gap.begin(), gap.end(), gap.begin(), ::tolower);
	return string(opt::k - 1, 'N') + gap;
}
Beispiel #6
0
/** Return the sequence of the contig node id, read from g_contigs.
 * A regular node yields a copy of the stored sequence, reverse
 * complemented when the node is on the reverse strand. An ambiguous
 * node yields opt::k - 1 'N' characters followed by its ambiguous
 * sequence, which is lower-cased when it is shorter than opt::k.
 */
static const Sequence getSequence(ContigNode id)
{
	if (!id.ambiguous()) {
		string seq(g_contigs[id.id()]);
		if (id.sense())
			return reverseComplement(seq);
		return seq;
	}

	string gap(id.ambiguousSequence());
	if (gap.length() < opt::k)
		transform(gap.begin(), gap.end(), gap.begin(), ::tolower);
	return string(opt::k - 1, 'N') + gap;
}
Beispiel #7
0
static void findOverlap(const Graph& g,
		ContigID refID, bool rc,
		const ContigNode& pair,
		const DistanceEst& est,
		OverlapGraph& out)
{
	if (refID == pair.id()
			|| (est.distance >= 0 && !opt::scaffold))
		return;
	ContigNode ref(refID, false);
	const ContigNode& t = rc ? pair : ref;
	const ContigNode& h = rc ? ref : pair;
	if (out_degree(t, g) > 0 || in_degree(h, g) > 0
			|| edge(t, h, out).second)
		return;

	bool mask = false;
	unsigned overlap
		= est.distance - (int)allowedError(est.stdDev) <= 0
		? findOverlap(g, t, h, mask) : 0;
	if (mask && !opt::mask)
		return;
	if (overlap > 0 || opt::scaffold)
		add_edge(t, h, Overlap(est, overlap, mask), out);
}
Beispiel #8
0
/** Estimate the distance between two contigs and write the estimate
 * to out.
 * Nothing is done when fewer than opt::npairs pairs are given. When
 * the estimate itself is supported by fewer than opt::npairs pairs,
 * nothing is written to out and, if opt::verbose exceeds one, a
 * warning is written to cerr instead.
 * @param out the stream receiving the estimate; in DOT format the
 * write is guarded by an OpenMP critical section
 * @param len0 the length passed to estimateDistance for id0
 * @param len1 the length passed to estimateDistance for id1
 */
static void writeEstimate(ostream& out,
		const ContigNode& id0, const ContigNode& id1,
		unsigned len0, unsigned len1,
		const Pairs& pairs, const PMF& pmf)
{
	if (pairs.size() < opt::npairs)
		return;

	DistanceEst est;
	est.distance = estimateDistance(len0, len1,
			pairs, pmf, est.numPairs);
	est.stdDev = pmf.getSampleStdDev(est.numPairs);

	std::pair<ContigNode, ContigNode> e(id0, id1 ^ id0.sense());
	if (est.numPairs < opt::npairs) {
		// Too few pairs support the estimate; report and give up.
		if (opt::verbose > 1) {
#pragma omp critical(cerr)
			cerr << "warning: " << get(g_contigNames, e)
				<< " [d=" << est.distance << "] "
				<< est.numPairs << " of " << pairs.size()
				<< " pairs fit the expected distribution\n";
		}
		return;
	}

	if (opt::format == DOT) {
#pragma omp critical(out)
		out << get(g_contigNames, e) << " [" << est << "]\n";
	} else {
		out << ' ' << get(g_contigNames, id1) << ',' << est;
	}
}
Beispiel #9
0
/** Add to g the overlaps of vseq found in the suffix array sa.
 * The query starts as vseq without its last character and is
 * passed to chop() after every lookup, until it is shorter than
 * opt::minOverlap. Each vertex u found gets at most one edge
 * (u, v), added for the first query that matches it.
 * @param v the vertex whose sequence is vseq
 * @param vseq the sequence to search for; must not be empty
 */
static void addOverlapsSA(Graph& g, const SuffixArray& sa,
		ContigNode v, const string& vseq)
{
	assert(!vseq.empty());
	typedef SuffixArray::const_iterator SAIt;
	set<ContigNode> linked;
	string query(vseq, 0, vseq.size() - 1);
	while (query.size() >= opt::minOverlap) {
		pair<SAIt, SAIt> hits = sa.equal_range(query);
		for (SAIt hit = hits.first; hit != hits.second; ++hit) {
			ContigNode u(hit->second);
			// In strand-specific mode, only join like senses.
			if (opt::ss && u.sense() != v.sense())
				continue;
			// Keep only the first overlap seen for each vertex.
			if (!linked.insert(u).second)
				continue;
			unsigned overlap = hit->first.size();
			add_edge(u, v, -overlap, static_cast<DG&>(g));
		}
		chop(query);
	}
}
Beispiel #10
0
/** Find a path for the specified distance estimates.
 * The estimates of er in direction dirIdx are turned into
 * constraints for constrainedSearch. Each path it returns is checked
 * against the estimates, and paths that disagree with them are
 * discarded. The outcome is tallied in stats; a single remaining
 * path, or an ambiguous path built from several remaining paths when
 * opt::scaffold is set, is appended to out.
 * Verbose output is collected locally and written to cout while a
 * function-local mutex is held; the same mutex guards the updates of
 * stats, g_minNumPairs and g_minNumPairsUsed made here.
 * @param g the contig graph to search
 * @param er the estimates of one reference contig
 * @param dirIdx selects which of the two estimate lists of er is
 * used, and is the sense of the origin vertex
 * @param out [out] the solution path
 */
static void handleEstimate(const Graph& g,
		const EstimateRecord& er, bool dirIdx,
		ContigPath& out)
{
	if (er.estimates[dirIdx].empty())
		return;

	ContigNode origin(er.refID, dirIdx);
	// Verbose output goes to vout_ss when opt::verbose is positive.
	// Otherwise it goes to a stream with no buffer, which discards
	// it.
	ostringstream vout_ss;
	ostream bitBucket(NULL);
	ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket;
	vout << "\n* " << get(vertex_name, g, origin) << '\n';

	unsigned minNumPairs = UINT_MAX;
	// generate the reachable set
	// One constraint per estimate: the target vertex and its
	// estimated distance plus the allowed error. The smallest number
	// of pairs behind any estimate is tracked as well.
	Constraints constraints;
	for (Estimates::const_iterator iter
				= er.estimates[dirIdx].begin();
			iter != er.estimates[dirIdx].end(); ++iter) {
		ContigNode v = iter->first;
		const DistanceEst& ep = iter->second;
		minNumPairs = min(minNumPairs, ep.numPairs);
		constraints.push_back(Constraint(v,
					ep.distance + allowedError(ep.stdDev)));
	}

	vout << "Constraints:";
	printConstraints(vout, g, constraints) << '\n';

	ContigPaths solutions;
	unsigned numVisited = 0;
	constrainedSearch(g, origin, constraints, solutions, numVisited);
	// Both flags are computed before any solution is filtered out.
	bool tooComplex = numVisited >= opt::maxCost;
	bool tooManySolutions = solutions.size() > opt::maxPaths;

	set<ContigID> repeats = findRepeats(er.refID, solutions);
	if (!repeats.empty()) {
		vout << "Repeats:";
		for (set<ContigID>::const_iterator it = repeats.begin();
				it != repeats.end(); ++it)
			vout << ' ' << get(g_contigNames, *it);
		vout << '\n';
	}

	// The number of paths found, before filtering.
	unsigned numPossiblePaths = solutions.size();
	if (numPossiblePaths > 0)
		vout << "Paths: " << numPossiblePaths << '\n';

	// The iterator is advanced in the loop body, because rejected
	// solutions are erased.
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end();) {
		vout << *solIter << '\n';

		// Calculate the path distance to each node and see if
		// it is within the estimated distance.
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);

		// Remove solutions whose distance estimates are not correct.
		unsigned validCount = 0, invalidCount = 0, ignoredCount = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			vout << get(vertex_name, g, v) << ',' << ep << '\t';

			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			if (dmIter == distanceMap.end()) {
				// This contig is a repeat.
				ignoredCount++;
				vout << "ignored\n";
				continue;
			}

			// translate distance by -overlap to match
			// coordinate space used by the estimate
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			unsigned buffer = allowedError(ep.stdDev);
			bool invalid = (unsigned)abs(diff) > buffer;
			bool repeat = repeats.count(v.contigIndex()) > 0;
			// A mismatch on a repeat contig does not count against
			// the solution.
			bool ignored = invalid && repeat;
			if (ignored)
				ignoredCount++;
			else if (invalid)
				invalidCount++;
			else
				validCount++;
			vout << "dist: " << actualDistance
				<< " diff: " << diff
				<< " buffer: " << buffer
				<< " n: " << ep.numPairs
				<< (ignored ? " ignored" : invalid ? " invalid" : "")
				<< '\n';
		}

		// Keep a solution only if no estimate contradicts it and at
		// least one estimate confirms it.
		if (invalidCount == 0 && validCount > 0)
			++solIter;
		else
			solIter = solutions.erase(solIter);
	}

	vout << "Solutions: " << solutions.size();
	if (tooComplex)
		vout << " (too complex)";
	if (tooManySolutions)
		vout << " (too many solutions)";
	vout << '\n';

	// Find the remaining solution with the smallest sum of absolute
	// differences between path distances and estimates, skipping
	// repeat contigs. bestSol is only used below when exactly one
	// solution remains.
	ContigPaths::iterator bestSol = solutions.end();
	int minDiff = 999999;
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end(); ++solIter) {
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);
		int sumDiff = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			if (repeats.count(v.contigIndex()) > 0)
				continue;
			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			assert(dmIter != distanceMap.end());
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			sumDiff += abs(diff);
		}

		if (sumDiff < minDiff) {
			minDiff = sumDiff;
			bestSol = solIter;
		}

		vout << *solIter
			<< " length: " << calculatePathLength(g, origin, *solIter)
			<< " sumdiff: " << sumDiff << '\n';
	}

	/** Lock the debugging stream. */
	// The lock is held until the end of the function, so it also
	// covers the updates of stats, g_minNumPairs and
	// g_minNumPairsUsed below.
	static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_lock(&coutMutex);
	stats.totalAttempted++;
	g_minNumPairs = min(g_minNumPairs, minNumPairs);

	// Classify the outcome; the first matching case wins.
	if (tooComplex) {
		stats.tooComplex++;
	} else if (tooManySolutions) {
		stats.tooManySolutions++;
	} else if (numPossiblePaths == 0) {
		stats.noPossiblePaths++;
	} else if (solutions.empty()) {
		stats.noValidPaths++;
	} else if (repeats.count(er.refID) > 0) {
		vout << "Repeat: " << get(vertex_name, g, origin) << '\n';
		stats.repeat++;
	} else if (solutions.size() > 1) {
		// Several solutions remain. The ambiguous path built from
		// them is appended to out only when scaffolding.
		ContigPath path
			= constructAmbiguousPath(g, origin, solutions);
		if (!path.empty()) {
			if (opt::extend)
				extend(g, path.back(), back_inserter(path));
			vout << path << '\n';
			if (opt::scaffold) {
				out.insert(out.end(), path.begin(), path.end());
				g_minNumPairsUsed
					= min(g_minNumPairsUsed, minNumPairs);
			}
		}
		stats.multiEnd++;
	} else {
		// Exactly one solution remains.
		assert(solutions.size() == 1);
		assert(bestSol != solutions.end());
		ContigPath& path = *bestSol;
		if (opt::verbose > 1)
			printDistanceMap(vout, g, origin, path);
		if (opt::extend)
			extend(g, path.back(), back_inserter(path));
		out.insert(out.end(), path.begin(), path.end());
		stats.uniqueEnd++;
		g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs);
	}
	// Write the collected verbose output in one piece. The string is
	// empty when opt::verbose is zero.
	cout << vout_ss.str();
	if (!out.empty())
		assert(!out.back().ambiguous());
	pthread_mutex_unlock(&coutMutex);
}
Beispiel #11
0
/** Return the sequence of the contig node id, read from g_contigs
 * and reverse complemented when the node is on the reverse strand.
 */
static string sequence(const ContigNode& id)
{
	const string& forward = g_contigs[id.id()];
	if (id.sense())
		return reverseComplement(forward);
	return forward;
}
Beispiel #12
0
/** Return true if the vertex u is a path and false if it is a
 * contig. A vertex is a path when its ID is at least
 * Vertex::s_offset.
 */
static bool isPath(const ContigNode& u)
{
	const bool atOrPastOffset = u.id() >= Vertex::s_offset;
	return atOrPastOffset;
}
/** Identify the paths subsumed by the specified seed path.
 * The path of every unambiguous contig on the seed path, other than
 * the seed's own contig, is aligned to the seed path. If the
 * consensus equals the seed path, ignoring ambiguous nodes, the other
 * path is subsumed. If it equals the other path, the search restarts
 * with that path as the seed. The debug output collected for the
 * abandoned seed is not printed in that case.
 * @param path1It the seed path
 * @param paths all paths, keyed by contig
 * @param out [out] the IDs of the subsumed paths; cleared on entry
 * @param overlaps [out] paths that are found to overlap; never
 * cleared here
 * @return the ID of the subsuming path
 */
static ContigID identifySubsumedPaths(const Lengths& lengths,
		ContigPathMap::const_iterator path1It,
		ContigPathMap& paths,
		set<ContigID>& out,
		set<ContigID>& overlaps)
{
	ostringstream log;
	out.clear();
	ContigID seedID(path1It->first);
	const ContigPath& seed = path1It->second;
	if (gDebugPrint)
		log << get(g_contigNames, ContigNode(seedID, false))
			<< '\t' << seed << '\n';

	for (ContigPath::const_iterator nodeIt = seed.begin();
			nodeIt != seed.end(); ++nodeIt) {
		ContigNode pivot = *nodeIt;
		if (pivot.ambiguous() || pivot.id() == seedID)
			continue;
		ContigPathMap::iterator otherIt
			= paths.find(pivot.contigIndex());
		if (otherIt == paths.end())
			continue;

		// Align a copy of the other path, oriented like the pivot.
		ContigPath other = otherIt->second;
		if (pivot.sense())
			reverseComplement(other.begin(), other.end());
		ContigPath consensus = align(lengths, seed, other, pivot);
		if (consensus.empty())
			continue;

		if (equalIgnoreAmbiguos(consensus, seed)) {
			// The seed path subsumes the other path.
			if (gDebugPrint)
				log << get(g_contigNames, pivot)
					<< '\t' << other << '\n';
			out.insert(otherIt->first);
			continue;
		}

		if (equalIgnoreAmbiguos(consensus, other)) {
			// The other path is larger. Use it as the seed.
			return identifySubsumedPaths(lengths, otherIt, paths,
					out, overlaps);
		}

		// The paths are only tested for cycles when the consensus is
		// itself a cycle.
		const bool consensusIsCycle = isCycle(lengths, consensus);
		const bool seedIsCycle
			= consensusIsCycle && isCycle(lengths, seed);
		const bool otherIsCycle
			= consensusIsCycle && isCycle(lengths, other);

		if (!seedIsCycle && !otherIsCycle) {
			// Either the consensus is not a cycle, or it is one but
			// neither path is. Both paths overlap.
			if (gDebugPrint)
				log << get(g_contigNames, pivot)
					<< '\t' << other << '\n'
					<< "ignored\t" << consensus << '\n';
			overlaps.insert(seedID);
			overlaps.insert(otherIt->first);
			continue;
		}

		// The consensus is a cycle and so is at least one path.
		if (gDebugPrint)
			log << get(g_contigNames, pivot)
				<< '\t' << other << '\n'
				<< "cycle\t" << consensus << '\n';
		if (seedIsCycle && otherIsCycle)
			out.insert(otherIt->first);
		else if (!seedIsCycle)
			overlaps.insert(seedID);
		else
			overlaps.insert(otherIt->first);
	}
	cout << log.str();
	return seedID;
}
/** Return true if a and b are equal, or if both of them are
 * ambiguous.
 */
static bool equalOrBothAmbiguos(const ContigNode& a,
		const ContigNode& b)
{
	if (a == b)
		return true;
	return a.ambiguous() && b.ambiguous();
}
/** Return the length of the specified contig in k-mer. An ambiguous
 * node reports its own length; any other node is looked up in
 * lengths by its ID, with bounds checking.
 */
static unsigned getLength(const Lengths& lengths,
		const ContigNode& u)
{
	if (u.ambiguous())
		return u.length();
	return lengths.at(u.id());
}