Пример #1
0
/** Find every path that overlaps with the specified path. */
static void findOverlaps(const Graph& g,
		const Paths& paths, const SeedMap& seedMap,
		const Vertex& v, Overlaps& overlaps)
{
	ContigPath rc;
	if (v.sense) {
		rc = paths[v.id];
		reverseComplement(rc.begin(), rc.end());
	}
	const ContigPath& path = v.sense ? rc : paths[v.id];

	for (ContigPath::const_iterator it = path.begin();
			it != path.end(); ++it) {
		if (it->ambiguous())
			continue;

		pair<SeedMap::const_iterator, SeedMap::const_iterator>
			range = seedMap.equal_range(*it);
		for (SeedMap::const_iterator seed = range.first;
				seed != range.second; ++seed) {
			if (v == seed->second)
				continue;
			int distance = 0;
			unsigned overlap = findOverlap(g, paths, it, path.end(),
					   seed->second, distance);
			if (overlap > 0)
				overlaps.push_back(Overlap(v, seed->second,
					overlap, distance));

		}
	}
}
Пример #2
0
/**
 * Read contig paths from a file, or from standard input when the
 * file name is "-".
 *
 * Each record is an ID followed by a path. A record with an empty
 * path is not stored; instead the vertex named by its ID is cleared
 * and removed from the graph.
 *
 * @param g the contig adjacency graph (modified for empty paths)
 * @param inPath the file of contig paths, or "-" for standard input
 * @param[out] pathIDs the IDs of the stored paths; must be empty on
 *   entry
 * @return the stored paths, parallel to pathIDs
 */
static Paths readPaths(Graph& g,
		const string& inPath, vector<string>& pathIDs)
{
	typedef graph_traits<Graph>::vertex_descriptor V;

	assert(pathIDs.empty());
	const bool useStdin = inPath == "-";
	ifstream fin(inPath.c_str());
	if (opt::verbose > 0)
		cerr << "Reading `" << inPath << "'..." << endl;
	if (!useStdin)
		assert_good(fin, inPath);
	istream& in = useStdin ? cin : fin;
	assert_good(in, inPath);

	Paths paths;
	string id;
	ContigPath path;
	while (in >> id >> path) {
		if (!path.empty()) {
			pathIDs.push_back(id);
			paths.push_back(path);
			continue;
		}
		// An empty path means the contig is dropped from the graph.
		V u = find_vertex(id, false, g);
		clear_vertex(u, g);
		remove_vertex(u, g);
	}
	// The loop must have stopped at end of input, not on a bad record.
	assert(in.eof());
	return paths;
}
Пример #3
0
/** Attempt to merge the paths specified in mergeQ with path.
 * @return the number of paths merged
 */
static unsigned mergePaths(const Lengths& lengths,
		ContigPath& path,
		deque<ContigNode>& mergeQ, set<ContigNode>& seen,
		const ContigPathMap& paths)
{
	unsigned merged = 0;
	deque<ContigNode> invalid;
	for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) {
		pivot = mergeQ.front();
		ContigPathMap::const_iterator path2It
			= paths.find(pivot.contigIndex());
		if (path2It == paths.end())
			continue;

		ContigPath path2 = path2It->second;
		if (pivot.sense())
			reverseComplement(path2.begin(), path2.end());
		ContigPath consensus = align(lengths, path, path2, pivot);
		if (consensus.empty()) {
			invalid.push_back(pivot);
			continue;
		}

		appendToMergeQ(mergeQ, seen, path2);
		path.swap(consensus);
		if (gDebugPrint)
#pragma omp critical(cout)
			cout << get(g_contigNames, pivot)
				<< '\t' << path2 << '\n'
				<< '\t' << path << '\n';
		merged++;
	}
	mergeQ.swap(invalid);
	return merged;
}
Пример #4
0
/**
 * Map each contig of a path to its distance along the path.
 *
 * The running distance starts at zero. For each contig it grows by the
 * distance from the previous contig (origin for the first one) before
 * the contig is recorded, and by the contig's length afterwards.
 * Contigs that occur more than once in the path have no single
 * position and are left out of the result.
 *
 * @param g the graph providing distances and contig lengths
 * @param origin the vertex preceding the first contig of the path
 * @param path the path to measure
 * @return the distance of every non-repeated contig of the path
 */
map<ContigNode, int> makeDistanceMap(const Graph& g,
		const ContigNode& origin, const ContigPath& path)
{
	typedef map<ContigNode, int> DistanceMap;

	DistanceMap distances;
	int offset = 0;
	vertex_descriptor prev = origin;
	for (ContigPath::const_iterator it = path.begin();
			it != path.end(); ++it) {
		vertex_descriptor cur = *it;
		offset += getDistance(g, prev, cur);

		pair<DistanceMap::iterator, bool> slot
			= distances.insert(make_pair(cur, offset));
		if (!slot.second) {
			// Seen before: flag the contig as a repeat.
			slot.first->second = INT_MIN;
		}

		offset += g[cur].length;
		prev = cur;
	}

	// Drop the entries flagged as repeats.
	DistanceMap::iterator entry = distances.begin();
	while (entry != distances.end()) {
		if (entry->second != INT_MIN) {
			++entry;
			continue;
		}
		distances.erase(entry++);
	}
	return distances;
}
Пример #5
0
/**
 * Remove ambiguous edges between paths that overlap by a single
 * contig.
 *
 * An edge (u,v) is removed when deg+(u) > 1, deg-(v) > 1, the last
 * contig of u's path equals the first contig of v's path, and that
 * shared contig has a path of its own in paths. The number of removed
 * edges is reported when verbose and, if a database is configured,
 * recorded there.
 *
 * @param[in,out] g the path overlap graph
 * @param paths the map of contig index to path
 */
static void removeSmallOverlaps(PathGraph& g,
		const ContigPathMap& paths)
{
	typedef graph_traits<PathGraph>::edge_descriptor E;
	typedef graph_traits<PathGraph>::out_edge_iterator Eit;
	typedef graph_traits<PathGraph>::vertex_descriptor V;
	typedef graph_traits<PathGraph>::vertex_iterator Vit;

	// Collect first, remove afterwards, so iteration stays valid.
	vector<E> doomed;
	pair<Vit, Vit> allVertices = vertices(g);
	for (Vit vi = allVertices.first; vi != allVertices.second; ++vi) {
		V u = *vi;
		if (out_degree(u, g) < 2)
			continue;
		ContigPath upath = getPath(paths, u);

		pair<Eit, Eit> outgoing = out_edges(u, g);
		for (Eit ei = outgoing.first; ei != outgoing.second; ++ei) {
			E e = *ei;
			V v = target(e, g);
			assert(v != u);
			if (in_degree(v, g) < 2)
				continue;
			ContigPath vpath = getPath(paths, v);
			if (upath.back() == vpath.front()
					&& paths.count(upath.back().contigIndex()) > 0)
				doomed.push_back(e);
		}
	}

	remove_edges(g, doomed.begin(), doomed.end());
	if (opt::verbose > 0)
		cout << "Removed " << doomed.size()
			<< " small overlap edges.\n";
	if (!opt::db.empty())
		addToDb(db, "Edges_removed_small_overlap", doomed.size());
}
Пример #6
0
/**
 * Queue the contigs of a path that have not been queued before.
 *
 * Ambiguous contigs are skipped. Every other contig is inserted into
 * seen, and appended to mergeQ only if that insertion was new.
 *
 * @param[in,out] mergeQ the queue to append to
 * @param[in,out] seen the contigs that have already been queued
 * @param path the path whose contigs are offered
 */
static void appendToMergeQ(deque<ContigNode>& mergeQ,
	set<ContigNode>& seen, const ContigPath& path)
{
	for (ContigPath::const_iterator node = path.begin();
			node != path.end(); ++node) {
		if (node->ambiguous())
			continue;
		if (!seen.insert(*node).second)
			continue;
		mergeQ.push_back(*node);
	}
}
Пример #7
0
/**
 * Set the seen flag of every contig in a path.
 *
 * Ambiguous contigs are skipped, as are contigs whose id is not a
 * valid index into seen.
 *
 * @param[in,out] seen the flags, indexed by contig id
 * @param path the path whose contigs are marked
 * @param flag the value to store
 */
static void markSeen(vector<bool>& seen, const ContigPath& path,
		bool flag)
{
	for (Path::const_iterator node = path.begin();
			node != path.end(); ++node) {
		if (node->ambiguous())
			continue;
		// Ignore ids beyond the end of the flag vector.
		if (!(node->id() < seen.size()))
			continue;
		seen[node->id()] = flag;
	}
}
Пример #8
0
/**
 * Return a copy of the path of the specified contig, oriented like
 * the contig.
 *
 * @param paths the map of contig index to path; must contain u
 * @param u the contig whose path is wanted
 * @return the stored path, reverse complemented when u.sense() is set
 */
static ContigPath getPath(const ContigPathMap& paths, ContigNode u)
{
	ContigPathMap::const_iterator found = paths.find(u.contigIndex());
	assert(found != paths.end());
	ContigPath oriented(found->second);
	if (u.sense())
		reverseComplement(oriented.begin(), oriented.end());
	return oriented;
}
Пример #9
0
/**
 * Check whether path starts with the sequence [first, last).
 *
 * Preconditions (checked with assert): the range is not empty, the
 * path is not empty, and the first element of the range equals the
 * first element of the (possibly reverse complemented) path.
 *
 * @param path the path to test; taken by value because it may be
 *   reverse complemented locally
 * @param rc whether to reverse complement path before comparing
 * @param first the start of the sequence to look for
 * @param last the end of the sequence to look for
 * @return true if the sequence is no longer than path and equals its
 *   leading elements
 */
static bool startsWith(ContigPath path, bool rc,
		ContigPath::const_iterator first,
		ContigPath::const_iterator last)
{
	// Validate the inputs before dereferencing first or taking
	// path.front(); either would be undefined on an empty range.
	assert(first < last);
	assert(!path.empty());
	if (rc)
		reverseComplement(path.begin(), path.end());
	assert(*first == path.front());
	return unsigned(last - first) <= path.size()
		&& equal(first, last, path.begin());
}
Пример #10
0
/**
 * Write a FASTA comment summarising a path to a stream.
 *
 * The comment is the comma-separated names of the first and last
 * contigs. A path of exactly three contigs also names the middle one,
 * and a longer path has "..." in place of its interior.
 *
 * @param out the stream to write to
 * @param g the graph providing the vertex names
 * @param path the path to describe; must have at least two contigs
 */
static void pathToComment(ostream& out,
		const Graph& g, const ContigPath& path)
{
	assert(path.size() > 1);
	out << get(vertex_name, g, path.front());
	if (path.size() > 3) {
		// Too long to list: elide the interior.
		out << ",...";
	} else if (path.size() == 3) {
		out << ',' << get(vertex_name, g, path[1]);
	}
	out << ',' << get(vertex_name, g, path.back());
}
Пример #11
0
/**
 * Trim an ambiguous contig from each end of the path.
 *
 * At most one contig is removed from the back and, afterwards, at
 * most one from the front.
 *
 * @param[in,out] path the path to trim
 */
static void removeAmbiguousContigs(ContigPath& path)
{
	if (path.empty())
		return;
	if (path.back().ambiguous())
		path.pop_back();

	// Trimming the back may have emptied the path.
	if (path.empty())
		return;
	if (path.front().ambiguous())
		path.erase(path.begin());
}
Пример #12
0
/** Return the length of the specified path in k-mer. */
static unsigned calculatePathLength(const Graph& g,
		const ContigNode& origin,
		const ContigPath& path, size_t prefix = 0, size_t suffix = 0)
{
	if (prefix + suffix == path.size())
		return 0;
	assert(prefix + suffix < path.size());
	int length = addProp(g, path.begin() + prefix,
			path.end() - suffix).length;

	// Account for the overlap on the left.
	vertex_descriptor u = prefix == 0 ? origin : path[prefix - 1];
	length += getDistance(g, u, path[prefix]);
	assert(length > 0);
	return length;
}
Пример #13
0
/**
 * Concatenate a chain of overlapping paths into a single path.
 *
 * Starting from the path of the first element of merge, the path of
 * each following element is appended after skipping the contigs it
 * shares with the end of the path built so far. The overlap between
 * consecutive elements comes from getOverlap().
 *
 * @param paths the paths, looked up through getPath()
 * @param overlaps the overlaps between pairs of paths
 * @param merge the non-empty sequence of paths to join, in order
 * @return the merged path
 */
static ContigPath mergePaths(const Paths& paths,
		const OverlapMap& overlaps, const ContigPath& merge)
{
	assert(!merge.empty());
	ContigPath::const_iterator next = merge.begin();
	ContigNode prev = *next;
	ContigPath merged(getPath(paths, prev));
	for (++next; next != merge.end(); ++next) {
		ContigNode cur = *next;
		ContigPath tail(getPath(paths, cur));
		unsigned overlap = getOverlap(overlaps, prev, cur);

		// The overlap must be a proper part of both paths, and the
		// two paths must actually agree on it.
		assert(merged.size() > overlap);
		assert(tail.size() > overlap);
		assert(equal(merged.end() - overlap, merged.end(),
					tail.begin()));

		merged.insert(merged.end(), tail.begin() + overlap, tail.end());
		prev = cur;
	}
	return merged;
}
Пример #14
0
/**
 * Build the contig that results from merging every contig of a path.
 *
 * While the sequence is still empty it is set to the sequence of the
 * current contig; afterwards each contig is merged onto it with
 * mergeContigs(). Coverage is summed over the unambiguous contigs.
 * The comment of the new contig is "<length> <coverage> " followed by
 * the text written by pathToComment().
 *
 * @param g the graph providing coverage and vertex names
 * @param contigs the contig sequences
 * @param path the path to merge
 * @return the merged contig
 */
static Contig mergePath(const Graph& g, const Contigs& contigs,
		const ContigPath& path)
{
	Sequence seq;
	unsigned coverage = 0;
	for (ContigPath::const_iterator it = path.begin();
			it != path.end(); ++it) {
		if (!it->ambiguous())
			coverage += g[*it].coverage;

		if (!seq.empty()) {
			assert(it != path.begin());
			mergeContigs(g, contigs, *(it-1), *it, seq, path);
		} else {
			seq = sequence(contigs, *it);
		}
	}

	ostringstream comment;
	comment << seq.size() << ' ' << coverage << ' ';
	pathToComment(comment, g, path);
	return Contig(comment.str(), seq);
}
Пример #15
0
/** Add an edge if the two paths overlap.
 * @param pivot the pivot at which to seed the alignment
 * @return whether an overlap was found
 */
static bool addOverlapEdge(const Lengths& lengths,
		PathGraph& gout, ContigNode pivot,
		ContigNode seed1, const ContigPath& path1,
		ContigNode seed2, const ContigPath& path2)
{
	assert(seed1 != seed2);

	// Determine the orientation of the overlap edge.
	dir_type orientation = DIR_X;
	ContigPath consensus = align(lengths,
			path1, path2, pivot, orientation);
	if (consensus.empty())
		return false;
	assert(orientation != DIR_X);
	if (orientation == DIR_B) {
		// One of the paths subsumes the other. Use the order of the
		// seeds to determine the orientation of the edge.
		orientation = find(consensus.begin(), consensus.end(), seed1)
			< find(consensus.begin(), consensus.end(), seed2)
			? DIR_F : DIR_R;
	}
	assert(orientation == DIR_F || orientation == DIR_R);

	// Add the edge.
	ContigNode u = orientation == DIR_F ? seed1 : seed2;
	ContigNode v = orientation == DIR_F ? seed2 : seed1;
	bool added = false;
#pragma omp critical(gout)
	if (!edge(u, v, gout).second) {
		add_edge(u, v, gout);
		added = true;
	}
	return added;
}
Пример #16
0
/**
 * Merge the paths of the seeds listed in a seed path.
 *
 * The result starts as the path of the first seed. The path of each
 * following seed is aligned against it, pivoting on that seed if the
 * result already contains it and on the previous seed otherwise. A
 * successful alignment replaces the result with the consensus; a
 * failed one leaves it unchanged. Progress is printed when
 * opt::verbose > 1.
 *
 * @param lengths forwarded to align()
 * @param paths the map of contig index to path; must contain every
 *   seed
 * @param seedPath the non-empty sequence of seeds
 * @return the merged contig path
 */
static ContigPath mergePath(const Lengths& lengths,
		const ContigPathMap& paths, const ContigPath& seedPath)
{
	assert(!seedPath.empty());
	ContigNode prevSeed = seedPath.front();
	ContigPathMap::const_iterator firstIt
		= paths.find(prevSeed.contigIndex());
	assert(firstIt != paths.end());
	ContigPath merged(firstIt->second);
	if (seedPath.front().sense())
		reverseComplement(merged.begin(), merged.end());
	if (opt::verbose > 1) {
#pragma omp critical(cout)
		cout << "\n* " << seedPath << '\n'
			<< get(g_contigNames, seedPath.front())
			<< '\t' << merged << '\n';
	}

	for (ContigPath::const_iterator it = seedPath.begin() + 1;
			it != seedPath.end(); ++it) {
		ContigNode seed = *it;
		ContigPathMap::const_iterator otherIt
			= paths.find(seed.contigIndex());
		assert(otherIt != paths.end());
		ContigPath other(otherIt->second);
		if (seed.sense())
			reverseComplement(other.begin(), other.end());

		// Prefer the new seed as pivot when it is already merged in.
		const bool seedInMerged
			= find(merged.begin(), merged.end(), seed) != merged.end();
		ContigNode pivot = seedInMerged ? seed : prevSeed;

		ContigPath consensus = align(lengths, merged, other, pivot);
		if (!consensus.empty()) {
			merged.swap(consensus);
			if (opt::verbose > 1) {
#pragma omp critical(cout)
				cout << get(g_contigNames, seed)
					<< '\t' << other << '\n'
					<< '\t' << merged << '\n';
			}
		} else if (opt::verbose > 1) {
			// This seed could be removed from the seed path.
#pragma omp critical(cout)
			cout << get(g_contigNames, seed)
				<< '\t' << other << '\n'
				<< "\tinvalid\n";
		}
		prevSeed = seed;
	}
	return merged;
}
Пример #17
0
/**
 * Thread entry point: read estimate records and write their paths
 * until the input is exhausted.
 *
 * Each record is read under the input mutex. The estimates in
 * er.estimates[1] have the low bit of their contig toggled, then a
 * path is built by calling handleEstimate() for direction true,
 * reverse complementing the result, appending the reference contig,
 * and calling handleEstimate() for direction false. Paths with more
 * than one contig are written under the output mutex.
 *
 * @param pArg a pointer to the WorkerArg of this thread
 * @return always NULL
 */
static void* worker(void* pArg)
{
	/** Serialises reads from the shared input stream. */
	static pthread_mutex_t inMutex = PTHREAD_MUTEX_INITIALIZER;
	/** Serialises writes to the shared output stream. */
	static pthread_mutex_t outMutex = PTHREAD_MUTEX_INITIALIZER;

	WorkerArg& arg = *static_cast<WorkerArg*>(pArg);
	for (;;) {
		pthread_mutex_lock(&inMutex);
		EstimateRecord er;
		// The stream-to-bool conversion is explicit since C++11, so
		// it cannot be used to copy-initialise a bool.
		const bool good = static_cast<bool>((*arg.in) >> er);
		pthread_mutex_unlock(&inMutex);
		if (!good)
			break;

		// Flip the anterior distance estimates.
		for (Estimates::iterator it = er.estimates[1].begin();
				it != er.estimates[1].end(); ++it)
			it->first ^= 1;

		ContigPath path;
		handleEstimate(*arg.graph, er, true, path);
		reverseComplement(path.begin(), path.end());
		path.push_back(ContigNode(er.refID, false));
		handleEstimate(*arg.graph, er, false, path);
		if (path.size() > 1) {
			pthread_mutex_lock(&outMutex);
			*arg.out << get(g_contigNames, er.refID)
				<< '\t' << path << '\n';
			assert(arg.out->good());
			pthread_mutex_unlock(&outMutex);
		}
	}
	return NULL;
}
Пример #18
0
/** Find the overlaps between paths and add edges to the graph. */
static void findPathOverlaps(const Lengths& lengths,
		const ContigPathMap& paths,
		const ContigNode& seed1, const ContigPath& path1,
		PathGraph& gout)
{
	for (ContigPath::const_iterator it = path1.begin();
			it != path1.end(); ++it) {
		ContigNode seed2 = *it;
		if (seed1 == seed2)
			continue;
		if (seed2.ambiguous())
			continue;
		ContigPathMap::const_iterator path2It
			= paths.find(seed2.contigIndex());
		if (path2It == paths.end())
			continue;

		ContigPath path2 = path2It->second;
		if (seed2.sense())
			reverseComplement(path2.begin(), path2.end());
		addOverlapEdge(lengths,
				gout, seed2, seed1, path1, seed2, path2);
	}
}
Пример #19
0
/**
 * Return a copy of a path with gap nodes inserted between contigs
 * that do not overlap.
 *
 * For each pair of consecutive contigs whose distance is not
 * negative, a node ContigNode(numN, 'N') is inserted between them,
 * where numN is distance + opt::k - 1, raised to at least 1.
 *
 * @param g the graph providing the distances
 * @param path the path to annotate; must not be empty, since its
 *   first contig is read unconditionally
 * @return the annotated path
 */
static ContigPath addDistance(const Graph& g, const ContigPath& path)
{
	ContigPath annotated;
	annotated.reserve(path.size());

	ContigPath::const_iterator next = path.begin();
	ContigNode prev = path.front();
	annotated.push_back(prev);
	for (++next; next != path.end(); ++next) {
		ContigNode cur = *next;
		int gap = getDistance(g, prev, cur);
		if (gap >= 0) {
			int numN = gap + opt::k - 1; // by convention
			assert(numN >= 0);
			// Never emit a gap node of size zero.
			numN = max(numN, 1);
			annotated.push_back(ContigNode(numN, 'N'));
		}
		annotated.push_back(cur);
		prev = cur;
	}
	return annotated;
}
Пример #20
0
/**
 * Look for a contig suitable as a pivot for aligning two paths.
 *
 * A suitable pivot is an unambiguous contig that occurs exactly once
 * in path2 and exactly once in path1. The contigs of path2 are tried
 * in order and the first suitable one wins.
 *
 * @param path1 the first path
 * @param path2 the second path, whose contigs are the candidates
 * @return the pivot paired with true, or ContigNode(0) paired with
 *   false when no suitable pivot exists
 */
static pair<ContigNode, bool> findPivot(
		const ContigPath& path1, const ContigPath& path2)
{
	for (ContigPath::const_iterator candidate = path2.begin();
			candidate != path2.end(); ++candidate) {
		if (candidate->ambiguous())
			continue;
		if (count(path2.begin(), path2.end(), *candidate) != 1)
			continue;
		if (count(path1.begin(), path1.end(), *candidate) != 1)
			continue;
		return make_pair(*candidate, true);
	}
	return make_pair(ContigNode(0), false);
}
Пример #21
0
/**
 * Search for the paths that can fill a gap and return their
 * consensus.
 *
 * A constrained search is run from the source of the gap to its
 * destination, allowing apConstraint.dist + opt::distanceError. The
 * source is prepended to every solution. The outcome is counted in
 * stats, and progress is printed to cerr when opt::verbose > 1.
 *
 * NOTE(review): the returned consensus is only assigned when there
 * are several solutions. With exactly one solution, stats.numMerged
 * is incremented but an empty path is returned - confirm that callers
 * expect this.
 *
 * @param g the contig graph
 * @param apConstraint the source, destination and size of the gap
 * @param[in,out] seen flags updated by markSeen() for the solutions
 *   that were used in a consensus
 * @param outFasta forwarded to align()
 * @return the consensus of the solutions, or an empty path
 */
static ContigPath fillGap(const Graph& g,
		const AmbPathConstraint& apConstraint,
		vector<bool>& seen,
		ofstream& outFasta)
{
	const bool chatty = opt::verbose > 1;
	if (chatty)
		cerr << "\n* "
			<< get(vertex_name, g, apConstraint.source) << ' '
			<< apConstraint.dist << "N "
			<< get(vertex_name, g, apConstraint.dest) << '\n';

	Constraints constraints;
	constraints.push_back(Constraint(apConstraint.dest,
				apConstraint.dist + opt::distanceError));

	ContigPaths solutions;
	unsigned numVisited = 0;
	constrainedSearch(g, apConstraint.source,
			constraints, solutions, numVisited);

	// The search omits the source itself; put it back in front.
	for (ContigPaths::iterator sol = solutions.begin();
			sol != solutions.end(); ++sol)
		sol->insert(sol->begin(), apConstraint.source);

	ContigPath consensus;
	if (numVisited >= opt::maxCost) {
		stats.tooComplex++;
		if (chatty)
			cerr << solutions.size() << " paths (too complex)\n";
		return consensus;
	}
	if (solutions.size() > opt::numBranches) {
		stats.numTooManySolutions++;
		if (chatty)
			cerr << solutions.size() << " paths (too many)\n";
		return consensus;
	}
	if (solutions.empty()) {
		stats.numNoSolutions++;
		if (chatty)
			cerr << "no paths\n";
		return consensus;
	}
	if (solutions.size() == 1) {
		if (chatty)
			cerr << "1 path\n" << solutions.front() << '\n';
		stats.numMerged++;
		return consensus;
	}

	// Several candidate paths: try to reduce them to one consensus.
	assert(solutions.size() > 1);
	if (opt::verbose > 2)
		copy(solutions.begin(), solutions.end(),
				ostream_iterator<ContigPath>(cerr, "\n"));
	else if (chatty)
		cerr << solutions.size() << " paths\n";

	consensus = align(g, solutions, outFasta);
	if (consensus.empty()) {
		stats.notMerged++;
		return consensus;
	}
	stats.numMerged++;
	// Mark contigs that are used in a consensus.
	markSeen(seen, solutions, true);
	if (chatty)
		cerr << consensus << '\n';
	return consensus;
}
Пример #22
0
/** Find a path for the specified distance estimates.
 *
 * Runs a constrained search from the reference contig of the record
 * in one direction, discards the solutions that disagree with the
 * distance estimates, and appends the surviving path (or an ambiguous
 * path built from several survivors) to out. Diagnostics are buffered
 * and written to cout, and the global statistics are updated, while
 * holding a function-local mutex.
 *
 * @param g the contig graph
 * @param er the distance estimates of one reference contig
 * @param dirIdx selects er.estimates[dirIdx] and the sense of the
 *   origin node
 * @param out [out] the solution path; contigs are appended to it
 */
static void handleEstimate(const Graph& g,
		const EstimateRecord& er, bool dirIdx,
		ContigPath& out)
{
	// Nothing to do when there are no estimates in this direction.
	if (er.estimates[dirIdx].empty())
		return;

	ContigNode origin(er.refID, dirIdx);
	// Diagnostics go to a string buffer when verbose, and otherwise
	// to a stream constructed with a null stream buffer.
	ostringstream vout_ss;
	ostream bitBucket(NULL);
	ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket;
	vout << "\n* " << get(vertex_name, g, origin) << '\n';

	unsigned minNumPairs = UINT_MAX;
	// generate the reachable set
	// Each estimate becomes a constraint on its contig with a bound
	// of the estimated distance plus the allowed error.
	Constraints constraints;
	for (Estimates::const_iterator iter
				= er.estimates[dirIdx].begin();
			iter != er.estimates[dirIdx].end(); ++iter) {
		ContigNode v = iter->first;
		const DistanceEst& ep = iter->second;
		minNumPairs = min(minNumPairs, ep.numPairs);
		constraints.push_back(Constraint(v,
					ep.distance + allowedError(ep.stdDev)));
	}

	vout << "Constraints:";
	printConstraints(vout, g, constraints) << '\n';

	ContigPaths solutions;
	unsigned numVisited = 0;
	constrainedSearch(g, origin, constraints, solutions, numVisited);
	// Both flags are computed before any solution is filtered out.
	bool tooComplex = numVisited >= opt::maxCost;
	bool tooManySolutions = solutions.size() > opt::maxPaths;

	set<ContigID> repeats = findRepeats(er.refID, solutions);
	if (!repeats.empty()) {
		vout << "Repeats:";
		for (set<ContigID>::const_iterator it = repeats.begin();
				it != repeats.end(); ++it)
			vout << ' ' << get(g_contigNames, *it);
		vout << '\n';
	}

	// The number of solutions before filtering.
	unsigned numPossiblePaths = solutions.size();
	if (numPossiblePaths > 0)
		vout << "Paths: " << numPossiblePaths << '\n';

	// Filter pass: keep a solution only if no estimate is invalid
	// for it and at least one estimate is valid.
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end();) {
		vout << *solIter << '\n';

		// Calculate the path distance to each node and see if
		// it is within the estimated distance.
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);

		// Remove solutions whose distance estimates are not correct.
		unsigned validCount = 0, invalidCount = 0, ignoredCount = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			vout << get(vertex_name, g, v) << ',' << ep << '\t';

			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			if (dmIter == distanceMap.end()) {
				// This contig is a repeat.
				// (It has no entry in the distance map of this
				// solution, so the estimate is not checked.)
				ignoredCount++;
				vout << "ignored\n";
				continue;
			}

			// translate distance by -overlap to match
			// coordinate space used by the estimate
			// NOTE(review): no translation is applied below; the
			// map value is used as is. The comment above may be
			// stale - confirm.
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			unsigned buffer = allowedError(ep.stdDev);
			bool invalid = (unsigned)abs(diff) > buffer;
			bool repeat = repeats.count(v.contigIndex()) > 0;
			// An out-of-range estimate of a repeat contig is
			// ignored rather than counted against the solution.
			bool ignored = invalid && repeat;
			if (ignored)
				ignoredCount++;
			else if (invalid)
				invalidCount++;
			else
				validCount++;
			vout << "dist: " << actualDistance
				<< " diff: " << diff
				<< " buffer: " << buffer
				<< " n: " << ep.numPairs
				<< (ignored ? " ignored" : invalid ? " invalid" : "")
				<< '\n';
		}

		if (invalidCount == 0 && validCount > 0)
			++solIter;
		else
			solIter = solutions.erase(solIter);
	}

	vout << "Solutions: " << solutions.size();
	if (tooComplex)
		vout << " (too complex)";
	if (tooManySolutions)
		vout << " (too many solutions)";
	vout << '\n';

	// Scoring pass: find the surviving solution with the smallest sum
	// of absolute differences over the non-repeat estimates. bestSol
	// is only dereferenced below when exactly one solution survives.
	ContigPaths::iterator bestSol = solutions.end();
	int minDiff = 999999;
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end(); ++solIter) {
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);
		int sumDiff = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			if (repeats.count(v.contigIndex()) > 0)
				continue;
			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			assert(dmIter != distanceMap.end());
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			sumDiff += abs(diff);
		}

		if (sumDiff < minDiff) {
			minDiff = sumDiff;
			bestSol = solIter;
		}

		vout << *solIter
			<< " length: " << calculatePathLength(g, origin, *solIter)
			<< " sumdiff: " << sumDiff << '\n';
	}

	/** Lock the debugging stream. */
	// The same mutex also covers the updates of stats,
	// g_minNumPairs and g_minNumPairsUsed below.
	static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_lock(&coutMutex);
	stats.totalAttempted++;
	g_minNumPairs = min(g_minNumPairs, minNumPairs);

	// Classify the outcome. Only the last two branches add to out.
	if (tooComplex) {
		stats.tooComplex++;
	} else if (tooManySolutions) {
		stats.tooManySolutions++;
	} else if (numPossiblePaths == 0) {
		stats.noPossiblePaths++;
	} else if (solutions.empty()) {
		stats.noValidPaths++;
	} else if (repeats.count(er.refID) > 0) {
		vout << "Repeat: " << get(vertex_name, g, origin) << '\n';
		stats.repeat++;
	} else if (solutions.size() > 1) {
		// Several valid solutions: reduce them to one ambiguous path,
		// which is only appended to out when scaffolding.
		ContigPath path
			= constructAmbiguousPath(g, origin, solutions);
		if (!path.empty()) {
			if (opt::extend)
				extend(g, path.back(), back_inserter(path));
			vout << path << '\n';
			if (opt::scaffold) {
				out.insert(out.end(), path.begin(), path.end());
				g_minNumPairsUsed
					= min(g_minNumPairsUsed, minNumPairs);
			}
		}
		stats.multiEnd++;
	} else {
		// Exactly one valid solution: append it to out.
		assert(solutions.size() == 1);
		assert(bestSol != solutions.end());
		ContigPath& path = *bestSol;
		if (opt::verbose > 1)
			printDistanceMap(vout, g, origin, path);
		if (opt::extend)
			extend(g, path.back(), back_inserter(path));
		out.insert(out.end(), path.begin(), path.end());
		stats.uniqueEnd++;
		g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs);
	}
	// Flush the buffered diagnostics (empty unless verbose).
	cout << vout_ss.str();
	if (!out.empty())
		assert(!out.back().ambiguous());
	pthread_mutex_unlock(&coutMutex);
}
Пример #23
0
/** Find an equivalent region of the two specified paths, starting the
 * alignment at pivot1 of path1 and pivot2 of path2.
 *
 * The paths are aligned in two halves that both begin at the pivot:
 * backwards from the pivot to the start of each path, and forwards
 * from the pivot to the end of each path. If both halves align, the
 * consensus is the reversed backward half without its first element,
 * followed by the forward half.
 *
 * @param lengths forwarded to the range-based align()
 * @param p1 the first path
 * @param p2 the second path
 * @param pivot1 the position of the pivot in p1
 * @param pivot2 the position of the pivot in p2; must refer to an
 *   element equal to *pivot1
 * @param[out] orientation the orientation of the alignment; only
 *   written when an alignment is found
 * @return the consensus sequence, or an empty path if either half
 *   does not align
 */
static ContigPath align(const Lengths& lengths,
		const ContigPath& p1, const ContigPath& p2,
		ContigPath::const_iterator pivot1,
		ContigPath::const_iterator pivot2,
		dir_type& orientation)
{
	assert(*pivot1 == *pivot2);
	// A reverse iterator built from pivot+1 dereferences to the
	// pivot, so the backward half starts at the pivot itself.
	ContigPath::const_reverse_iterator
		rit1 = ContigPath::const_reverse_iterator(pivot1+1),
		rit2 = ContigPath::const_reverse_iterator(pivot2+1);
	// Size the output buffer for the combined length of both
	// backward ranges, then trim it to what was actually written.
	ContigPath alignmentr(p1.rend() - rit1 + p2.rend() - rit2);
	ContigPath::iterator rout = alignmentr.begin();
	dir_type alignedr = align(lengths,
			rit1, p1.rend(), rit2, p2.rend(), rout);
	alignmentr.erase(rout, alignmentr.end());

	// The forward half, handled the same way.
	ContigPath::const_iterator it1 = pivot1, it2 = pivot2;
	ContigPath alignmentf(p1.end() - it1 + p2.end() - it2);
	ContigPath::iterator fout = alignmentf.begin();
	dir_type alignedf = align(lengths,
			it1, p1.end(), it2, p2.end(), fout);
	alignmentf.erase(fout, alignmentf.end());

	ContigPath consensus;
	if (alignedr != DIR_X && alignedf != DIR_X) {
		// Found an alignment.
		assert(!alignmentf.empty());
		assert(!alignmentr.empty());
		consensus.reserve(alignmentr.size()-1 + alignmentf.size());
		// Un-reverse the backward half, leaving out its first element
		// (presumably the pivot, which also starts the forward half).
		consensus.assign(alignmentr.rbegin(), alignmentr.rend()-1);
		consensus.insert(consensus.end(),
				alignmentf.begin(), alignmentf.end());

		// Determine the orientation of the alignment.
		// The two results are packed into a four-bit index: the
		// backward result in the high two bits, the forward result in
		// the low two bits.
		unsigned dirs = alignedr << 2 | alignedf;
		static const dir_type DIRS[16] = {
			DIR_X, // 0000 XX impossible
			DIR_X, // 0001 XF impossible
			DIR_X, // 0010 XR impossible
			DIR_X, // 0011 XB impossible
			DIR_X, // 0100 FX impossible
			DIR_B, // 0101 FF u is subsumed in v
			DIR_R, // 0110 FR v->u
			DIR_R, // 0111 FB v->u
			DIR_X, // 1000 RX impossible
			DIR_F, // 1001 RF u->v
			DIR_B, // 1010 RR v is subsumed in u
			DIR_F, // 1011 RB u->v
			DIR_X, // 1100 BX impossible
			DIR_F, // 1101 BF u->v
			DIR_R, // 1110 BR v->u
			DIR_B, // 1111 BB u and v are equal
		};
		assert(dirs < 16);
		orientation = DIRS[dirs];
		assert(orientation != DIR_X);
	}
	return consensus;
}
Пример #24
0
/**
 * Return an ambiguous path that agrees with all the given paths.
 *
 * The result is the longest prefix shared by every path, followed,
 * if the paths also share a suffix, by one gap node and that suffix.
 * The shared suffix is searched only in the part of the shortest path
 * not already covered by the shared prefix. The size of the gap node
 * is derived from the longest path according to ComparePathLength.
 * When the paths share no suffix, only the shared prefix is returned.
 *
 * @param g the contig graph
 * @param origin the vertex preceding the first contig of each path
 * @param paths the non-empty set of alternative paths
 * @return the ambiguous path
 */
static ContigPath constructAmbiguousPath(const Graph &g,
		const ContigNode& origin, const ContigPaths& paths)
{
	typedef ContigPaths::const_iterator PathsIt;

	assert(!paths.empty());
	const ContigPath& firstSol = paths.front();

	// The shared prefix and suffix cannot exceed the shortest path.
	size_t min_len = firstSol.size();
	for (PathsIt it = paths.begin() + 1; it != paths.end(); ++it)
		min_len = min(min_len, it->size());

	// Count the leading positions at which all paths agree.
	size_t longestPrefix = 0;
	for (bool agree = true; longestPrefix < min_len; ++longestPrefix) {
		const ContigNode& expected = firstSol[longestPrefix];
		for (PathsIt sol = paths.begin(); sol != paths.end(); ++sol) {
			if ((*sol)[longestPrefix] != expected) {
				agree = false;
				break;
			}
		}
		if (!agree)
			break;
	}

	// Count the trailing positions at which all paths agree, without
	// running into the shared prefix.
	size_t longestSuffix = 0;
	for (bool agree = true;
			longestSuffix < min_len-longestPrefix; ++longestSuffix) {
		const ContigNode& expected
			= firstSol[firstSol.size()-longestSuffix-1];
		for (PathsIt sol = paths.begin(); sol != paths.end(); ++sol) {
			if ((*sol)[sol->size()-longestSuffix-1] != expected) {
				agree = false;
				break;
			}
		}
		if (!agree)
			break;
	}

	ContigPath out;
	out.reserve(longestPrefix + 1 + longestSuffix);
	out.insert(out.end(),
			firstSol.begin(), firstSol.begin() + longestPrefix);
	if (longestSuffix == 0)
		return out;

	const ContigPath& longestPath(
			*max_element(paths.begin(), paths.end(),
				ComparePathLength(g, origin)));
	unsigned length = calculatePathLength(g, origin, longestPath,
			longestPrefix, longestSuffix);

	// Account for the overlap on the right: the distance between the
	// first contig of the suffix and the contig before it (origin
	// when the suffix spans the whole path).
	const size_t suffixStart = longestPath.size() - longestSuffix;
	const ContigNode& beforeSuffix = suffixStart == 0 ? origin
		: longestPath[suffixStart - 1];
	int dist = length + getDistance(g,
			beforeSuffix, longestPath[suffixStart]);

	// Add k-1 because it is the convention.
	int numN = dist + opt::k - 1;
	assert(numN > 0);

	out.push_back(ContigNode(numN, 'N'));
	out.insert(out.end(),
			firstSol.end() - longestSuffix, firstSol.end());
	return out;
}
Пример #25
0
	/**
	 * Order paths by their length according to calculatePathLength(),
	 * breaking ties by the number of contigs.
	 * @return true if a sorts before b
	 */
	bool operator()(const ContigPath& a, const ContigPath& b) const {
		const unsigned lenA = calculatePathLength(m_g, m_origin, a);
		const unsigned lenB = calculatePathLength(m_g, m_origin, b);
		if (lenA != lenB)
			return lenA < lenB;
		return a.size() < b.size();
	}
Пример #26
0
/** Find an equivalent region of the two specified paths.
 *
 * Two shortcuts are tried first, unless a path is being aligned to
 * itself: identical paths yield path1 with orientation DIR_B, and a
 * path2 found as a contiguous run inside path1 yields path1 with an
 * orientation that depends on where the run sits.
 *
 * Otherwise the paths are aligned around a pivot. If the given pivot
 * is missing from either path, findPivot() picks a replacement. Every
 * occurrence of the pivot in path1 is tried against its first
 * occurrence in path2 until an alignment is found.
 *
 * @param lengths forwarded to the iterator-based align()
 * @param path1 the first path
 * @param path2 the second path; may be the same object as path1
 * @param pivot the contig at which to seed the alignment
 * @param[out] orientation the orientation of the alignment; only
 *   meaningful when the result is not empty
 * @return the consensus sequence, or an empty path if the paths do
 *   not align
 */
static ContigPath align(const Lengths& lengths,
		const ContigPath& path1, const ContigPath& path2,
		ContigNode pivot, dir_type& orientation)
{
	if (&path1 == &path2) {
		// Ignore the trivial alignment when aligning a path to
		// itself.
	} else if (path1 == path2) {
		// These two paths are identical.
		orientation = DIR_B;
		return path1;
	} else {
		ContigPath::const_iterator it
			= search(path1.begin(), path1.end(),
				path2.begin(), path2.end());
		if (it != path1.end()) {
			// path2 is subsumed in path1.
			// Determine the orientation of the edge.
			orientation
				= it == path1.begin() ? DIR_R
				: it + path2.size() == path1.end() ? DIR_F
				: DIR_B;
			return path1;
		}
	}

	// Find a suitable pivot.
	if (find(path1.begin(), path1.end(), pivot) == path1.end()
			|| find(path2.begin(), path2.end(), pivot)
				== path2.end()) {
		bool good;
		tie(pivot, good) = findPivot(path1, path2);
		if (!good)
			return ContigPath();
	}
	assert(find(path1.begin(), path1.end(), pivot) != path1.end());

	ContigPath::const_iterator it2 = find(path2.begin(), path2.end(),
			pivot);
	assert(it2 != path2.end());
	if (&path1 != &path2) {
		// The seed must be unique in path2, unless we're aligning a
		// path to itself.
		assert(count(it2+1, path2.end(), pivot) == 0);
	}

	// Try each occurrence of the pivot in path1 in turn. Plain find()
	// is used because bind2nd() was removed from the standard library
	// in C++17; it compares with operator== exactly as
	// equal_to<ContigNode> did.
	ContigPath consensus;
	for (ContigPath::const_iterator it1
				= find(path1.begin(), path1.end(), pivot);
			it1 != path1.end();
			it1 = find(it1+1, path1.end(), pivot)) {
		if (&*it1 == &*it2) {
			// We are aligning a path to itself, and this is the
			// trivial alignment, which we'll ignore.
			continue;
		}
		consensus = align(lengths,
				path1, path2, it1, it2, orientation);
		if (!consensus.empty())
			return consensus;
	}
	return consensus;
}
Пример #27
0
/**
 * Trim a path down to the contigs in [first, last).
 *
 * The contigs outside that range are passed to
 * recordTrimmedContigs() and erased. When the range is empty
 * (first >= last) the whole path is recorded and cleared. Finally an
 * ambiguous contig left at either end is removed.
 *
 * @param[in,out] path the path to trim
 * @param first the index of the first contig to keep
 * @param last the index one past the last contig to keep
 */
static void removeContigs(ContigPath& path,
		unsigned first, unsigned last)
{
	assert(first <= path.size());
	assert(last <= path.size());
	if (first >= last) {
		// Nothing is kept.
		recordTrimmedContigs(path.begin(), path.end());
		path.clear();
	} else {
		recordTrimmedContigs(path.begin(), path.begin() + first);
		recordTrimmedContigs(path.begin() + last, path.end());
		// Erase the tail first so that the index of the head range
		// is still valid.
		path.erase(path.begin() + last, path.end());
		path.erase(path.begin(), path.begin() + first);
	}
	removeAmbiguousContigs(path);
}
Пример #28
0
/**
 * Compare two paths element by element with equalOrBothAmbiguos.
 *
 * @param a the first path
 * @param b the second path
 * @return true if the paths have the same size and every pair of
 *   corresponding contigs satisfies equalOrBothAmbiguos
 */
static bool equalIgnoreAmbiguos(const ContigPath& a,
		const ContigPath& b)
{
	if (a.size() != b.size())
		return false;
	return equal(a.begin(), a.end(), b.begin(), equalOrBothAmbiguos);
}
Пример #29
0
/** Return whether this path is a cycle. */
static bool isCycle(const Lengths& lengths, const ContigPath& path)
{
	return !align(lengths, path, path, path.front()).empty();
}
Пример #30
0
/**
 * Identify the paths subsumed by the specified path.
 *
 * The path of every unambiguous contig on this path (other than the
 * path's own contig) is aligned against it. Depending on the
 * consensus:
 *  - equal to this path: the other path is subsumed and added to out;
 *  - equal to the other path: the other path is larger, and the
 *    search restarts from it (out is cleared by the recursive call);
 *  - a cycle, with both paths cycles: the other path is added to out;
 *  - a cycle, with one path a cycle: the path that is not a cycle is
 *    added to overlaps;
 *  - anything else: both paths are added to overlaps.
 *
 * NOTE(review): the buffered debug output of this invocation is only
 * written on the normal return, not when the search restarts from a
 * larger path.
 *
 * @param lengths forwarded to align() and isCycle()
 * @param path1It the path to start from
 * @param paths the map of contig index to path
 * @param out [out] the IDs of the subsumed paths; cleared on entry
 * @param overlaps [out] paths that are found to overlap
 * @return the ID of the subsuming path
 */
static ContigID identifySubsumedPaths(const Lengths& lengths,
		ContigPathMap::const_iterator path1It,
		ContigPathMap& paths,
		set<ContigID>& out,
		set<ContigID>& overlaps)
{
	ostringstream log;
	out.clear();
	ContigID id(path1It->first);
	const ContigPath& path = path1It->second;
	if (gDebugPrint)
		log << get(g_contigNames, ContigNode(id, false))
			<< '\t' << path << '\n';

	for (ContigPath::const_iterator it = path.begin();
			it != path.end(); ++it) {
		ContigNode pivot = *it;
		if (pivot.ambiguous() || pivot.id() == id)
			continue;
		ContigPathMap::iterator otherIt
			= paths.find(pivot.contigIndex());
		if (otherIt == paths.end())
			continue;

		// Orient the other path the same way as the pivot.
		ContigPath other(otherIt->second);
		if (pivot.sense())
			reverseComplement(other.begin(), other.end());
		ContigPath consensus = align(lengths, path, other, pivot);
		if (consensus.empty())
			continue;

		if (equalIgnoreAmbiguos(consensus, path)) {
			// The other path adds nothing to this one.
			if (gDebugPrint)
				log << get(g_contigNames, pivot)
					<< '\t' << other << '\n';
			out.insert(otherIt->first);
			continue;
		}
		if (equalIgnoreAmbiguos(consensus, other)) {
			// This path is larger. Use it as the seed.
			return identifySubsumedPaths(lengths, otherIt, paths, out,
					overlaps);
		}

		// The individual paths are only tested for being cycles when
		// the consensus is one.
		bool pathIsCycle = false;
		bool otherIsCycle = false;
		if (isCycle(lengths, consensus)) {
			pathIsCycle = isCycle(lengths, path);
			otherIsCycle = isCycle(lengths, other);
		}

		if (!pathIsCycle && !otherIsCycle) {
			// Neither path is a cycle: a plain overlap.
			if (gDebugPrint)
				log << get(g_contigNames, pivot)
					<< '\t' << other << '\n'
					<< "ignored\t" << consensus << '\n';
			overlaps.insert(id);
			overlaps.insert(otherIt->first);
			continue;
		}

		// At least one path is a cycle.
		if (gDebugPrint)
			log << get(g_contigNames, pivot)
				<< '\t' << other << '\n'
				<< "cycle\t" << consensus << '\n';
		if (pathIsCycle && otherIsCycle)
			out.insert(otherIt->first);
		else if (!pathIsCycle)
			overlaps.insert(id);
		else
			overlaps.insert(otherIt->first);
	}
	cout << log.str();
	return id;
}