Пример #1
0
/** Strip at most one ambiguous contig from each end of the path.
 * The back of the path is examined first, then the front.
 * @param path [in,out] the path to trim in place
 */
static void removeAmbiguousContigs(ContigPath& path)
{
	// Trailing end.
	if (path.empty())
		return;
	if (path.back().ambiguous())
		path.erase(path.end() - 1);

	// Leading end. The path may have become empty just above.
	if (path.empty())
		return;
	if (path.front().ambiguous())
		path.erase(path.begin());
}
Пример #2
0
/** Build one contig path by merging, in order, the paths of every
 * seed of the specified seed path.
 * Each seed's path is looked up in paths by contig index and is
 * reverse complemented when the seed has its sense flag set. A seed
 * whose alignment against the path merged so far is empty is skipped.
 * @param lengths passed through to align
 * @param paths the known paths, keyed by contig index; every seed
 *        must be present
 * @param seedPath the seeds to merge; must not be empty
 * @return the merged contig path
 */
static ContigPath mergePath(const Lengths& lengths,
		const ContigPathMap& paths, const ContigPath& seedPath)
{
	assert(!seedPath.empty());
	ContigPath::const_iterator seedIt = seedPath.begin();
	ContigNode prevSeed = seedPath.front();
	ContigPathMap::const_iterator firstIt
		= paths.find(prevSeed.contigIndex());
	assert(firstIt != paths.end());

	// Start from the path of the first seed.
	ContigPath merged(firstIt->second);
	if (seedPath.front().sense())
		reverseComplement(merged.begin(), merged.end());
	if (opt::verbose > 1) {
#pragma omp critical(cout)
		cout << "\n* " << seedPath << '\n'
			<< get(g_contigNames, seedPath.front())
			<< '\t' << merged << '\n';
	}

	// Fold in the path of each remaining seed.
	for (++seedIt; seedIt != seedPath.end(); ++seedIt) {
		ContigNode seed = *seedIt;
		ContigPathMap::const_iterator foundIt
			= paths.find(seed.contigIndex());
		assert(foundIt != paths.end());
		ContigPath seedsPath = foundIt->second;
		if (seed.sense())
			reverseComplement(seedsPath.begin(), seedsPath.end());

		// Pivot on this seed if the merged path already contains
		// it, and on the previous seed otherwise.
		ContigNode pivot = prevSeed;
		if (find(merged.begin(), merged.end(), seed) != merged.end())
			pivot = seed;

		ContigPath consensus
			= align(lengths, merged, seedsPath, pivot);
		// An empty consensus means this seed could be removed from
		// the seed path; the merged path is left untouched.
		bool valid = !consensus.empty();
		if (valid)
			merged.swap(consensus);

		if (opt::verbose > 1) {
#pragma omp critical(cout)
			{
				cout << get(g_contigNames, seed)
					<< '\t' << seedsPath << '\n';
				if (valid)
					cout << '\t' << merged << '\n';
				else
					cout << "\tinvalid\n";
			}
		}
		prevSeed = seed;
	}
	return merged;
}
Пример #3
0
/** Load contig paths from a file, or from standard input when the
 * file name is "-".
 * Each record is an ID followed by a path. A record with an empty
 * path causes the vertex named by that ID to be cleared and removed
 * from the graph; every other record is appended to the result.
 * @param g [in,out] the contig adjacency graph
 * @param inPath the file of contig paths, or "-" for standard input
 * @param[out] pathIDs the IDs of the returned paths, in the same
 *             order; must be empty on entry
 * @return the non-empty paths
 */
static Paths readPaths(Graph& g,
		const string& inPath, vector<string>& pathIDs)
{
	typedef graph_traits<Graph>::vertex_descriptor Vertex;

	assert(pathIDs.empty());
	ifstream fin(inPath.c_str());
	if (opt::verbose > 0)
		cerr << "Reading `" << inPath << "'..." << endl;

	// The file stream is only checked when it is actually used.
	const bool useStdin = inPath == "-";
	if (!useStdin)
		assert_good(fin, inPath);
	istream& in = useStdin ? cin : fin;
	assert_good(in, inPath);

	Paths loaded;
	string id;
	ContigPath path;
	while (in >> id >> path) {
		if (!path.empty()) {
			pathIDs.push_back(id);
			loaded.push_back(path);
			continue;
		}
		// An empty path: remove this contig from the graph.
		Vertex u = find_vertex(id, false, g);
		clear_vertex(u, g);
		remove_vertex(u, g);
	}
	// Reading must stop at end of file, not at a malformed record.
	assert(in.eof());
	return loaded;
}
Пример #4
0
/** Attempt to merge the paths specified in mergeQ with path.
 * @return the number of paths merged
 */
static unsigned mergePaths(const Lengths& lengths,
		ContigPath& path,
		deque<ContigNode>& mergeQ, set<ContigNode>& seen,
		const ContigPathMap& paths)
{
	unsigned merged = 0;
	deque<ContigNode> invalid;
	for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) {
		pivot = mergeQ.front();
		ContigPathMap::const_iterator path2It
			= paths.find(pivot.contigIndex());
		if (path2It == paths.end())
			continue;

		ContigPath path2 = path2It->second;
		if (pivot.sense())
			reverseComplement(path2.begin(), path2.end());
		ContigPath consensus = align(lengths, path, path2, pivot);
		if (consensus.empty()) {
			invalid.push_back(pivot);
			continue;
		}

		appendToMergeQ(mergeQ, seen, path2);
		path.swap(consensus);
		if (gDebugPrint)
#pragma omp critical(cout)
			cout << get(g_contigNames, pivot)
				<< '\t' << path2 << '\n'
				<< '\t' << path << '\n';
		merged++;
	}
	mergeQ.swap(invalid);
	return merged;
}
Пример #5
0
/** Add an edge if the two paths overlap.
 * @param pivot the pivot at which to seed the alignment
 * @return whether an overlap was found
 */
static bool addOverlapEdge(const Lengths& lengths,
		PathGraph& gout, ContigNode pivot,
		ContigNode seed1, const ContigPath& path1,
		ContigNode seed2, const ContigPath& path2)
{
	assert(seed1 != seed2);

	// Determine the orientation of the overlap edge.
	dir_type orientation = DIR_X;
	ContigPath consensus = align(lengths,
			path1, path2, pivot, orientation);
	if (consensus.empty())
		return false;
	assert(orientation != DIR_X);
	if (orientation == DIR_B) {
		// One of the paths subsumes the other. Use the order of the
		// seeds to determine the orientation of the edge.
		orientation = find(consensus.begin(), consensus.end(), seed1)
			< find(consensus.begin(), consensus.end(), seed2)
			? DIR_F : DIR_R;
	}
	assert(orientation == DIR_F || orientation == DIR_R);

	// Add the edge.
	ContigNode u = orientation == DIR_F ? seed1 : seed2;
	ContigNode v = orientation == DIR_F ? seed2 : seed1;
	bool added = false;
#pragma omp critical(gout)
	if (!edge(u, v, gout).second) {
		add_edge(u, v, gout);
		added = true;
	}
	return added;
}
Пример #6
0
/** Concatenate the paths of a chain of overlapping seeds.
 * The result starts as the path of the first seed. For each following
 * seed, its path is appended with the leading elements that overlap
 * the previous seed's path left out.
 * @param paths the paths, looked up with getPath
 * @param overlaps the overlaps, looked up with getOverlap
 * @param merge the seeds to merge, in order; must not be empty
 * @return the merged path
 */
static ContigPath mergePaths(const Paths& paths,
		const OverlapMap& overlaps, const ContigPath& merge)
{
	assert(!merge.empty());
	ContigPath::const_iterator cur = merge.begin();
	ContigNode prev = *cur;
	ContigPath result(getPath(paths, prev));
	for (++cur; cur != merge.end(); ++cur) {
		ContigNode next = *cur;
		ContigPath nextPath(getPath(paths, next));
		unsigned shared = getOverlap(overlaps, prev, next);

		// Both paths must be strictly longer than the overlap, and
		// the tail of the result must match the head of nextPath.
		assert(result.size() > shared);
		assert(nextPath.size() > shared);
		assert(equal(result.end() - shared, result.end(),
					nextPath.begin()));

		result.insert(result.end(),
				nextPath.begin() + shared, nextPath.end());
		prev = next;
	}
	return result;
}
Пример #7
0
/** Find a path for the specified distance estimates.
 * Searches the graph from the origin contig (er.refID in direction
 * dirIdx) for paths that satisfy the distance estimates, discards
 * candidate paths that contradict an estimate, and appends the chosen
 * path, if any, to out. Global statistics are updated and the verbose
 * log is written to cout under a function-local mutex.
 * @param g the contig adjacency graph
 * @param er the estimate record; only er.estimates[dirIdx] is used
 * @param dirIdx the direction in which to search from er.refID
 * @param out [out] the solution path; left unchanged if no path is
 *        chosen
 */
static void handleEstimate(const Graph& g,
		const EstimateRecord& er, bool dirIdx,
		ContigPath& out)
{
	// Nothing to do without any estimates in this direction.
	if (er.estimates[dirIdx].empty())
		return;

	ContigNode origin(er.refID, dirIdx);
	// Verbose output is buffered in vout_ss and printed at the end
	// under the lock. When not verbose, it is sent to a stream with
	// no buffer instead.
	ostringstream vout_ss;
	ostream bitBucket(NULL);
	ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket;
	vout << "\n* " << get(vertex_name, g, origin) << '\n';

	unsigned minNumPairs = UINT_MAX;
	// generate the reachable set
	// Each estimate becomes a constraint whose distance is the
	// estimate plus its allowed error. The smallest number of pairs
	// supporting any estimate is tracked along the way.
	Constraints constraints;
	for (Estimates::const_iterator iter
				= er.estimates[dirIdx].begin();
			iter != er.estimates[dirIdx].end(); ++iter) {
		ContigNode v = iter->first;
		const DistanceEst& ep = iter->second;
		minNumPairs = min(minNumPairs, ep.numPairs);
		constraints.push_back(Constraint(v,
					ep.distance + allowedError(ep.stdDev)));
	}

	vout << "Constraints:";
	printConstraints(vout, g, constraints) << '\n';

	// Search for candidate paths, and note whether the search hit
	// the cost limit or found more paths than allowed.
	ContigPaths solutions;
	unsigned numVisited = 0;
	constrainedSearch(g, origin, constraints, solutions, numVisited);
	bool tooComplex = numVisited >= opt::maxCost;
	bool tooManySolutions = solutions.size() > opt::maxPaths;

	set<ContigID> repeats = findRepeats(er.refID, solutions);
	if (!repeats.empty()) {
		vout << "Repeats:";
		for (set<ContigID>::const_iterator it = repeats.begin();
				it != repeats.end(); ++it)
			vout << ' ' << get(g_contigNames, *it);
		vout << '\n';
	}

	// Remember the number of candidates before any are filtered out.
	unsigned numPossiblePaths = solutions.size();
	if (numPossiblePaths > 0)
		vout << "Paths: " << numPossiblePaths << '\n';

	// Filter the candidates. The iterator is advanced in the loop
	// body because erase returns the next valid iterator.
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end();) {
		vout << *solIter << '\n';

		// Calculate the path distance to each node and see if
		// it is within the estimated distance.
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);

		// Remove solutions whose distance estimates are not correct.
		unsigned validCount = 0, invalidCount = 0, ignoredCount = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			vout << get(vertex_name, g, v) << ',' << ep << '\t';

			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			if (dmIter == distanceMap.end()) {
				// This contig is a repeat.
				ignoredCount++;
				vout << "ignored\n";
				continue;
			}

			// NOTE(review): an earlier comment here described
			// translating the distance by -overlap, but no
			// translation is applied; the value from the distance
			// map is compared with the estimate as is.
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			unsigned buffer = allowedError(ep.stdDev);
			// An estimate is invalid if the path distance differs
			// from it by more than the allowed error. An invalid
			// estimate to a repeat contig is ignored rather than
			// counted against the path.
			bool invalid = (unsigned)abs(diff) > buffer;
			bool repeat = repeats.count(v.contigIndex()) > 0;
			bool ignored = invalid && repeat;
			if (ignored)
				ignoredCount++;
			else if (invalid)
				invalidCount++;
			else
				validCount++;
			vout << "dist: " << actualDistance
				<< " diff: " << diff
				<< " buffer: " << buffer
				<< " n: " << ep.numPairs
				<< (ignored ? " ignored" : invalid ? " invalid" : "")
				<< '\n';
		}

		// Keep a path only if it agrees with at least one estimate
		// and contradicts none.
		if (invalidCount == 0 && validCount > 0)
			++solIter;
		else
			solIter = solutions.erase(solIter);
	}

	vout << "Solutions: " << solutions.size();
	if (tooComplex)
		vout << " (too complex)";
	if (tooManySolutions)
		vout << " (too many solutions)";
	vout << '\n';

	// Among the remaining paths, find the one with the smallest sum
	// of absolute differences from the estimates, skipping repeats.
	// minDiff starts at a large sentinel value.
	ContigPaths::iterator bestSol = solutions.end();
	int minDiff = 999999;
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end(); ++solIter) {
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);
		int sumDiff = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			if (repeats.count(v.contigIndex()) > 0)
				continue;
			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			assert(dmIter != distanceMap.end());
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			sumDiff += abs(diff);
		}

		if (sumDiff < minDiff) {
			minDiff = sumDiff;
			bestSol = solIter;
		}

		vout << *solIter
			<< " length: " << calculatePathLength(g, origin, *solIter)
			<< " sumdiff: " << sumDiff << '\n';
	}

	/** Lock the debugging stream. */
	// The same lock also covers the updates of stats, g_minNumPairs
	// and g_minNumPairsUsed below.
	static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_lock(&coutMutex);
	stats.totalAttempted++;
	g_minNumPairs = min(g_minNumPairs, minNumPairs);

	// Classify the outcome. Exactly one counter in stats is
	// incremented, and out is extended only in the last two cases.
	if (tooComplex) {
		stats.tooComplex++;
	} else if (tooManySolutions) {
		stats.tooManySolutions++;
	} else if (numPossiblePaths == 0) {
		stats.noPossiblePaths++;
	} else if (solutions.empty()) {
		stats.noValidPaths++;
	} else if (repeats.count(er.refID) > 0) {
		vout << "Repeat: " << get(vertex_name, g, origin) << '\n';
		stats.repeat++;
	} else if (solutions.size() > 1) {
		// Several valid paths remain. Build a single ambiguous path
		// from them, and use it only when scaffolding is enabled.
		ContigPath path
			= constructAmbiguousPath(g, origin, solutions);
		if (!path.empty()) {
			if (opt::extend)
				extend(g, path.back(), back_inserter(path));
			vout << path << '\n';
			if (opt::scaffold) {
				out.insert(out.end(), path.begin(), path.end());
				g_minNumPairsUsed
					= min(g_minNumPairsUsed, minNumPairs);
			}
		}
		stats.multiEnd++;
	} else {
		// Exactly one valid path remains, so it is the best one.
		assert(solutions.size() == 1);
		assert(bestSol != solutions.end());
		ContigPath& path = *bestSol;
		if (opt::verbose > 1)
			printDistanceMap(vout, g, origin, path);
		if (opt::extend)
			extend(g, path.back(), back_inserter(path));
		out.insert(out.end(), path.begin(), path.end());
		stats.uniqueEnd++;
		g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs);
	}
	// Print the buffered log, which is empty when not verbose.
	cout << vout_ss.str();
	// The output path must not end with an ambiguous contig.
	if (!out.empty())
		assert(!out.back().ambiguous());
	pthread_mutex_unlock(&coutMutex);
}
Пример #8
0
/** Collect the paths that are subsumed by the specified path.
 * Every unambiguous contig of the path, other than the path's own
 * contig, that has a path of its own is aligned against this path.
 * If the other path turns out to subsume this one, the search
 * restarts with the other path as the seed, and the debugging output
 * gathered so far is not printed.
 * @param lengths passed through to align and isCycle
 * @param path1It the entry of the seed path in paths
 * @param paths the known paths, keyed by contig index
 * @param out [out] the IDs of the subsumed paths; cleared on entry
 * @param overlaps [out] paths that are found to overlap
 * @return the ID of the subsuming path
 */
static ContigID identifySubsumedPaths(const Lengths& lengths,
		ContigPathMap::const_iterator path1It,
		ContigPathMap& paths,
		set<ContigID>& out,
		set<ContigID>& overlaps)
{
	ostringstream log;
	out.clear();
	ContigID id(path1It->first);
	const ContigPath& path1 = path1It->second;
	if (gDebugPrint)
		log << get(g_contigNames, ContigNode(id, false))
			<< '\t' << path1 << '\n';

	for (ContigPath::const_iterator nodeIt = path1.begin();
			nodeIt != path1.end(); ++nodeIt) {
		ContigNode pivot = *nodeIt;
		if (pivot.ambiguous() || pivot.id() == id)
			continue;
		ContigPathMap::iterator path2It
			= paths.find(pivot.contigIndex());
		if (path2It == paths.end())
			continue;

		// Orient a copy of the other path like the pivot.
		ContigPath path2 = path2It->second;
		if (pivot.sense())
			reverseComplement(path2.begin(), path2.end());
		ContigPath consensus = align(lengths, path1, path2, pivot);
		if (consensus.empty())
			continue;

		if (equalIgnoreAmbiguos(consensus, path1)) {
			// The other path is subsumed by this path.
			if (gDebugPrint)
				log << get(g_contigNames, pivot)
					<< '\t' << path2 << '\n';
			out.insert(path2It->first);
			continue;
		}

		if (equalIgnoreAmbiguos(consensus, path2)) {
			// This path is larger. Use it as the seed.
			return identifySubsumedPaths(lengths, path2It, paths, out,
					overlaps);
		}

		if (!isCycle(lengths, consensus)) {
			// The paths merely overlap.
			if (gDebugPrint)
				log << get(g_contigNames, pivot)
					<< '\t' << path2 << '\n'
					<< "ignored\t" << consensus << '\n';
			overlaps.insert(id);
			overlaps.insert(path2It->first);
			continue;
		}

		// The consensus path is a cycle.
		const bool cyclic1 = isCycle(lengths, path1);
		const bool cyclic2 = isCycle(lengths, path2);
		if (!cyclic1 && !cyclic2) {
			// Neither path is a cycle.
			if (gDebugPrint)
				log << get(g_contigNames, pivot)
					<< '\t' << path2 << '\n'
					<< "ignored\t" << consensus << '\n';
			overlaps.insert(id);
			overlaps.insert(path2It->first);
			continue;
		}

		// At least one path is a cycle.
		if (gDebugPrint)
			log << get(g_contigNames, pivot)
				<< '\t' << path2 << '\n'
				<< "cycle\t" << consensus << '\n';
		if (cyclic1 && cyclic2)
			out.insert(path2It->first);
		else if (!cyclic1)
			overlaps.insert(id);
		else
			overlaps.insert(path2It->first);
	}
	cout << log.str();
	return id;
}
Пример #9
0
/** Find an equivalent region of the two specified paths.
 * Two cases are answered without a pivot. Identical paths give path1
 * with orientation DIR_B. If path2 occurs as a contiguous run inside
 * path1, the result is path1 with orientation DIR_R when the run is at
 * the start of path1, DIR_F when it is at the end, and DIR_B
 * otherwise. In every other case the paths are aligned at each
 * occurrence of the pivot in path1 until one alignment succeeds.
 * @param lengths passed through to the iterator-based align overload
 * @param path1 the first path
 * @param path2 the second path; may be the same object as path1, in
 *        which case the trivial self-alignment is skipped
 * @param pivot the contig at which to seed the alignment; if it is
 *        missing from either path, findPivot chooses another
 * @param[out] orientation the orientation of the alignment
 * @return the consensus sequence, or an empty path if no pivot or no
 *         alignment is found
 */
static ContigPath align(const Lengths& lengths,
		const ContigPath& path1, const ContigPath& path2,
		ContigNode pivot, dir_type& orientation)
{
	if (&path1 == &path2) {
		// Ignore the trivial alignment when aligning a path to
		// itself.
	} else if (path1 == path2) {
		// These two paths are identical.
		orientation = DIR_B;
		return path1;
	} else {
		ContigPath::const_iterator it
			= search(path1.begin(), path1.end(),
				path2.begin(), path2.end());
		if (it != path1.end()) {
			// path2 is subsumed in path1.
			// Determine the orientation of the edge.
			orientation
				= it == path1.begin() ? DIR_R
				: it + path2.size() == path1.end() ? DIR_F
				: DIR_B;
			return path1;
		}
	}

	// Find a suitable pivot.
	if (find(path1.begin(), path1.end(), pivot) == path1.end()
			|| find(path2.begin(), path2.end(), pivot)
				== path2.end()) {
		bool good;
		tie(pivot, good) = findPivot(path1, path2);
		if (!good)
			return ContigPath();
	}
	assert(find(path1.begin(), path1.end(), pivot) != path1.end());

	ContigPath::const_iterator it2 = find(path2.begin(), path2.end(),
			pivot);
	assert(it2 != path2.end());
	if (&path1 != &path2) {
		// The seed must be unique in path2, unless we're aligning a
		// path to itself.
		assert(count(it2+1, path2.end(), pivot) == 0);
	}

	// Try each occurrence of the pivot in path1 in turn. Plain find
	// is used instead of find_if with bind2nd, which was removed in
	// C++17; both compare each element to the pivot with operator==.
	ContigPath consensus;
	for (ContigPath::const_iterator it1
				= find(path1.begin(), path1.end(), pivot);
			it1 != path1.end();
			it1 = find(it1+1, path1.end(), pivot)) {
		if (&*it1 == &*it2) {
			// We are aligning a path to itself, and this is the
			// trivial alignment, which we'll ignore.
			continue;
		}
		consensus = align(lengths,
				path1, path2, it1, it2, orientation);
		if (!consensus.empty())
			return consensus;
	}
	return consensus;
}
Пример #10
0
/** Return the consensus sequence of the specified gap.
 * Searches the graph for paths from the source of the gap to its
 * destination within the gap distance plus opt::distanceError,
 * prepends the source to each path found, and aligns the paths when
 * there is more than one.
 * NOTE(review): when exactly one path is found it is counted in
 * stats.numMerged, but the returned consensus is empty. Confirm that
 * callers expect this.
 * @param g the contig adjacency graph
 * @param apConstraint the source, destination and distance of the gap
 * @param seen [in,out] passed to markSeen for the contigs of the
 *        solutions when a consensus is found
 * @param outFasta passed through to align
 * @return the consensus path, or an empty path if none was built
 */
static ContigPath fillGap(const Graph& g,
		const AmbPathConstraint& apConstraint,
		vector<bool>& seen,
		ofstream& outFasta)
{
	const bool chatty = opt::verbose > 1;
	if (chatty)
		cerr << "\n* "
			<< get(vertex_name, g, apConstraint.source) << ' '
			<< apConstraint.dist << "N "
			<< get(vertex_name, g, apConstraint.dest) << '\n';

	// The only constraint is to reach the destination within the
	// allowed distance.
	Constraints constraints;
	constraints.push_back(Constraint(apConstraint.dest,
				apConstraint.dist + opt::distanceError));

	ContigPaths solutions;
	unsigned numVisited = 0;
	constrainedSearch(g, apConstraint.source,
			constraints, solutions, numVisited);

	// Prepend the source contig to every solution.
	for (ContigPaths::iterator solIt = solutions.begin();
			solIt != solutions.end(); ++solIt)
		solIt->insert(solIt->begin(), apConstraint.source);

	if (numVisited >= opt::maxCost) {
		stats.tooComplex++;
		if (chatty)
			cerr << solutions.size() << " paths (too complex)\n";
		return ContigPath();
	}
	if (solutions.size() > opt::numBranches) {
		stats.numTooManySolutions++;
		if (chatty)
			cerr << solutions.size() << " paths (too many)\n";
		return ContigPath();
	}
	if (solutions.empty()) {
		stats.numNoSolutions++;
		if (chatty)
			cerr << "no paths\n";
		return ContigPath();
	}
	if (solutions.size() == 1) {
		if (chatty)
			cerr << "1 path\n" << solutions.front() << '\n';
		stats.numMerged++;
		return ContigPath();
	}

	// Several paths span the gap: align them into a consensus.
	assert(solutions.size() > 1);
	if (opt::verbose > 2)
		copy(solutions.begin(), solutions.end(),
				ostream_iterator<ContigPath>(cerr, "\n"));
	else if (chatty)
		cerr << solutions.size() << " paths\n";

	ContigPath consensus = align(g, solutions, outFasta);
	if (consensus.empty()) {
		stats.notMerged++;
		return consensus;
	}

	stats.numMerged++;
	// Mark contigs that are used in a consensus.
	markSeen(seen, solutions, true);
	if (chatty)
		cerr << consensus << '\n';
	return consensus;
}