Exemple #1
0
/** Load the contig paths stored in a file.
 * Every record is a string ID followed by a path. The file name "-"
 * selects standard input.
 * @param inPath the file to read, or "-" for standard input
 * @param ids [out] when not NULL, receives the string ID of each
 * path in the order read; it must be empty on entry
 * @return the paths in the order read
 */
static ContigPaths readPaths(const string& inPath,
		vector<string>* ids = NULL)
{
	if (ids != NULL)
		assert(ids->empty());

	ifstream fin(inPath.c_str());
	if (opt::verbose > 0)
		cerr << "Reading `" << inPath << "'..." << endl;

	// The file stream is only validated when it is the one being read.
	const bool fromStdin = inPath == "-";
	if (!fromStdin)
		assert_good(fin, inPath);
	istream& in = fromStdin ? cin : fin;

	ContigPaths paths;
	unsigned numRead = 0;
	string name;
	ContigPath record;
	while (in >> name >> record) {
		paths.push_back(record);
		if (ids != NULL)
			ids->push_back(name);
		++numRead;

		// Report progress once every million paths.
		const bool report = opt::verbose > 1 && numRead % 1000000 == 0;
		if (report) {
			cerr << "Read " << numRead << " paths. Using "
				<< toSI(getMemoryUsage()) << "B of memory.\n";
		}
	}

	if (opt::verbose > 0) {
		cerr << "Read " << numRead << " paths. Using "
			<< toSI(getMemoryUsage()) << "B of memory.\n";
	}

	// Reading must have stopped at the end of the input, not on a
	// malformed record.
	assert(in.eof());
	return paths;
}
Exemple #2
0
/** Flag every contig that occurs in any of the given paths.
 * @param seen [in,out] indexed by contig ID; the entry of each
 * contig found in a path is set to true. IDs that are not less than
 * seen.size() are skipped.
 * @param paths the paths to scan
 */
static void seenContigs(vector<bool>& seen, const ContigPaths& paths)
{
	const size_t numContigs = seen.size();
	for (ContigPaths::const_iterator pathIt = paths.begin();
			pathIt != paths.end(); ++pathIt) {
		const ContigPath& path = *pathIt;
		for (ContigPath::const_iterator nodeIt = path.begin();
				nodeIt != path.end(); ++nodeIt) {
			if (nodeIt->id() < numContigs)
				seen[nodeIt->id()] = true;
		}
	}
}
Exemple #3
0
/** Flag the contigs that are to be removed.
 * A contig is removed when the path stored under its ID is empty.
 * @param marked [in,out] indexed by contig index; set to true for
 * each contig to remove
 * @param pathIDs the string ID of each path, parallel to paths
 * @param paths the paths to inspect
 */
static void markRemovedContigs(vector<bool>& marked,
		const vector<string>& pathIDs, const ContigPaths& paths)
{
	const ContigPaths::const_iterator first = paths.begin();
	for (ContigPaths::const_iterator cur = first;
			cur != paths.end(); ++cur) {
		if (!cur->empty())
			continue;

		// Translate the path's string ID into a contig index.
		const string& name = pathIDs[cur - first];
		size_t i = get(g_contigNames, name);
		assert(i < marked.size());
		marked[i] = true;
	}
}
Exemple #4
0
/** Collect the contigs that occur at least twice within any one
 * solution. The seed is counted once in every solution, in addition
 * to the nodes of the solution itself.
 * @param seed the contig from which the solutions start
 * @param solutions the candidate paths
 * @return the contigs repeated within at least one solution
 */
static set<ContigID> findRepeats(ContigID seed,
	const ContigPaths& solutions)
{
	typedef map<ContigID, unsigned> Tally;

	set<ContigID> repeats;
	for (ContigPaths::const_iterator sol = solutions.begin();
			sol != solutions.end(); ++sol) {
		// Occurrences are tallied per solution, not across solutions.
		Tally occurrences;
		++occurrences[seed];
		for (ContigPath::const_iterator node = sol->begin();
				node != sol->end(); ++node)
			++occurrences[node->contigIndex()];

		for (Tally::const_iterator entry = occurrences.begin();
				entry != occurrences.end(); ++entry) {
			if (entry->second >= 2)
				repeats.insert(entry->first);
		}
	}
	return repeats;
}
Exemple #5
0
/** Read contig paths from the specified file and register every gap
 * found in them.
 * Each record is a string ID followed by a path. For every ambiguous
 * element that has a neighbour on both sides, an entry keyed by
 * (left neighbour, right neighbour, gap length) is inserted into
 * g_ambpath_contig with an empty path as its value.
 * @param[in] inPath the filename of the contig paths, or "-" for
 * standard input
 * @param[out] ids the string ID of the paths; must be empty on entry
 * @param[out] isAmb whether the path contains a gap; must be empty
 * on entry
 * @return the paths in the order read
 */
static ContigPaths readPaths(const string& inPath,
	vector<string>& ids, vector<bool>& isAmb)
{
	assert(ids.empty());
	assert(isAmb.empty());
	assert(g_ambpath_contig.empty());
	ifstream fin(inPath.c_str());
	if (opt::verbose > 0)
		cerr << "Reading `" << inPath << "'..." << endl;
	if (inPath != "-")
		assert_good(fin, inPath);
	istream& in = inPath == "-" ? cin : fin;

	ContigPaths paths;
	string id;
	Path path;
	while (in >> id >> path) {
		paths.push_back(path);
		ids.push_back(id);
		isAmb.push_back(false);

		// Only elements with a neighbour on each side are examined,
		// so paths of fewer than three elements are skipped.
		if (path.size() <= 2)
			continue;
		for (Path::iterator it = path.begin() + 2;
				it != path.end(); ++it) {
			// Sliding window: t and v flank the candidate gap u.
			ContigPath::value_type t = it[-2], u = it[-1], v = it[0];
			if (!u.ambiguous())
				continue;
			assert(!t.ambiguous());
			assert(!v.ambiguous());
			g_ambpath_contig.insert(AmbPath2Contig::value_type(
				AmbPathConstraint(t, v, u.length()),
				ContigPath()));
			isAmb.back() = true;
		}
	}
	// Reading must have stopped at the end of the input, not on a
	// malformed record.
	assert(in.eof());
	return paths;
}
/** Assemble the path overlap graph. */
static void assemblePathGraph(const Lengths& lengths,
		PathGraph& pathGraph, ContigPathMap& paths)
{
	ContigPaths seedPaths;
	assembleDFS(pathGraph, back_inserter(seedPaths));
	ContigPaths mergedPaths = mergeSeedPaths(lengths,
			paths, seedPaths);
	if (opt::verbose > 1)
		cout << '\n';

	// Replace each path with the merged path.
	for (ContigPaths::const_iterator it1 = seedPaths.begin();
			it1 != seedPaths.end(); ++it1) {
		const ContigPath& path(mergedPaths[it1 - seedPaths.begin()]);
		ContigPath pathrc(path);
		reverseComplement(pathrc.begin(), pathrc.end());
		for (ContigPath::const_iterator it2 = it1->begin();
				it2 != it1->end(); ++it2) {
			ContigNode seed(*it2);
			if (find(path.begin(), path.end(), seed) != path.end()) {
				paths[seed.contigIndex()]
					= seed.sense() ? pathrc : path;
			} else {
				// This seed was not included in the merged path.
			}
		}
	}

	removeRepeats(paths);

	// Remove the subsumed paths.
	if (opt::verbose > 0)
		cout << "Removing redundant contigs\n";
	removeSubsumedPaths(lengths, paths);

	outputSortedPaths(paths);
}
/** Merge each of the given seed paths with mergePath.
 * @param lengths the contig lengths, forwarded to mergePath
 * @param paths the path of each contig, forwarded to mergePath
 * @param seedPaths the seed paths to merge
 * @return one merged path per seed path, in the same order
 */
static ContigPaths mergeSeedPaths(const Lengths& lengths,
		const ContigPathMap& paths, const ContigPaths& seedPaths)
{
	if (opt::verbose > 0)
		cout << "\nMerging paths\n";

	ContigPaths merged;
	merged.reserve(seedPaths.size());
	ContigPaths::const_iterator seed = seedPaths.begin();
	while (seed != seedPaths.end()) {
		merged.push_back(mergePath(lengths, paths, *seed));
		++seed;
	}
	return merged;
}
Exemple #8
0
/** Update the overlap graph with the given paths and write it out.
 * A merged vertex named after the path ID is added for every
 * non-empty path. Afterwards the vertex named by the ID of each
 * empty path is removed, as are the non-ambiguous vertices of each
 * non-empty path. The graph is written to opt::graphPath.
 * @param g [in,out] the overlap graph
 * @param pathIDs the string ID of each path, parallel to paths
 * @param paths the paths to apply
 * @param commandLine passed through to write_graph
 */
static void outputGraph(Graph& g,
		const vector<string>& pathIDs, const ContigPaths& paths,
		const string& commandLine)
{
	typedef graph_traits<Graph>::vertex_descriptor V;
	typedef ContigPaths::const_iterator PathIt;

	// First pass: add one merged vertex per non-empty path. The name
	// table is unlocked only for the duration of this pass.
	g_contigNames.unlock();
	for (PathIt p = paths.begin(); p != paths.end(); ++p) {
		if (p->empty())
			continue;
		V merged = merge(g, p->begin(), p->end());
		put(vertex_name, g, merged, pathIDs[p - paths.begin()]);
	}
	g_contigNames.lock();

	// Second pass: drop the vertices that the paths replace.
	for (PathIt p = paths.begin(); p != paths.end(); ++p) {
		if (p->empty()) {
			const string& id = pathIDs[p - paths.begin()];
			remove_vertex(find_vertex(id, false, g), g);
			continue;
		}
		remove_vertex_if(g, p->begin(), p->end(),
				not1(std::mem_fun_ref(&ContigNode::ambiguous)));
	}

	// Write the resulting graph.
	const string& graphPath = opt::graphPath;
	assert(!graphPath.empty());
	if (opt::verbose > 0)
		cerr << "Writing `" << graphPath << "'..." << endl;
	ofstream fout(graphPath.c_str());
	assert_good(fout, graphPath);
	write_graph(fout, g, PROGRAM, commandLine);
	// Check the stream again to catch errors raised while writing.
	assert_good(fout, graphPath);
	if (opt::verbose > 0)
		printGraphStats(cerr, g);
}
Exemple #9
0
/** Find a path for the specified distance estimates.
 * Runs a constrained search from the origin node built from
 * (er.refID, dirIdx), discards the solutions that disagree with the
 * estimates, and appends the chosen path, if any, to out. The global
 * statistics are updated and the buffered log is printed while a
 * function-local mutex is held.
 * @param g the graph to search
 * @param er the distance estimates of one contig
 * @param dirIdx selects er.estimates[dirIdx] and is the second
 * argument used to construct the origin node
 * @param out [out] the solution path; left untouched when there are
 * no estimates for dirIdx
 */
static void handleEstimate(const Graph& g,
		const EstimateRecord& er, bool dirIdx,
		ContigPath& out)
{
	if (er.estimates[dirIdx].empty())
		return;

	ContigNode origin(er.refID, dirIdx);
	// Log output is buffered in vout_ss and printed at the end under
	// the mutex. When not verbose, it goes to a stream constructed
	// with a null buffer instead.
	ostringstream vout_ss;
	ostream bitBucket(NULL);
	ostream& vout = opt::verbose > 0 ? vout_ss : bitBucket;
	vout << "\n* " << get(vertex_name, g, origin) << '\n';

	unsigned minNumPairs = UINT_MAX;
	// generate the reachable set
	Constraints constraints;
	for (Estimates::const_iterator iter
				= er.estimates[dirIdx].begin();
			iter != er.estimates[dirIdx].end(); ++iter) {
		ContigNode v = iter->first;
		const DistanceEst& ep = iter->second;
		minNumPairs = min(minNumPairs, ep.numPairs);
		// Each constraint is the estimated distance plus the error
		// allowed for its standard deviation.
		constraints.push_back(Constraint(v,
					ep.distance + allowedError(ep.stdDev)));
	}

	vout << "Constraints:";
	printConstraints(vout, g, constraints) << '\n';

	ContigPaths solutions;
	unsigned numVisited = 0;
	constrainedSearch(g, origin, constraints, solutions, numVisited);
	// Both flags are computed before any solution is filtered out.
	bool tooComplex = numVisited >= opt::maxCost;
	bool tooManySolutions = solutions.size() > opt::maxPaths;

	set<ContigID> repeats = findRepeats(er.refID, solutions);
	if (!repeats.empty()) {
		vout << "Repeats:";
		for (set<ContigID>::const_iterator it = repeats.begin();
				it != repeats.end(); ++it)
			vout << ' ' << get(g_contigNames, *it);
		vout << '\n';
	}

	// The number of solutions before filtering.
	unsigned numPossiblePaths = solutions.size();
	if (numPossiblePaths > 0)
		vout << "Paths: " << numPossiblePaths << '\n';

	// The iterator is advanced inside the loop body: either
	// incremented or replaced by the result of erase().
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end();) {
		vout << *solIter << '\n';

		// Calculate the path distance to each node and see if
		// it is within the estimated distance.
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);

		// Remove solutions whose distance estimates are not correct.
		unsigned validCount = 0, invalidCount = 0, ignoredCount = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			vout << get(vertex_name, g, v) << ',' << ep << '\t';

			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			if (dmIter == distanceMap.end()) {
				// This contig is a repeat.
				ignoredCount++;
				vout << "ignored\n";
				continue;
			}

			// translate distance by -overlap to match
			// coordinate space used by the estimate
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			unsigned buffer = allowedError(ep.stdDev);
			bool invalid = (unsigned)abs(diff) > buffer;
			bool repeat = repeats.count(v.contigIndex()) > 0;
			// An out-of-range estimate to a repeated contig is
			// ignored rather than counted against the solution.
			bool ignored = invalid && repeat;
			if (ignored)
				ignoredCount++;
			else if (invalid)
				invalidCount++;
			else
				validCount++;
			vout << "dist: " << actualDistance
				<< " diff: " << diff
				<< " buffer: " << buffer
				<< " n: " << ep.numPairs
				<< (ignored ? " ignored" : invalid ? " invalid" : "")
				<< '\n';
		}

		// Keep a solution only if no estimate is invalid and at
		// least one is valid.
		if (invalidCount == 0 && validCount > 0)
			++solIter;
		else
			solIter = solutions.erase(solIter);
	}

	vout << "Solutions: " << solutions.size();
	if (tooComplex)
		vout << " (too complex)";
	if (tooManySolutions)
		vout << " (too many solutions)";
	vout << '\n';

	// Find the remaining solution with the smallest sum of absolute
	// differences from the estimates, skipping repeated contigs.
	ContigPaths::iterator bestSol = solutions.end();
	int minDiff = 999999;
	for (ContigPaths::iterator solIter = solutions.begin();
			solIter != solutions.end(); ++solIter) {
		map<ContigNode, int> distanceMap
			= makeDistanceMap(g, origin, *solIter);
		int sumDiff = 0;
		for (Estimates::const_iterator iter
					= er.estimates[dirIdx].begin();
				iter != er.estimates[dirIdx].end(); ++iter) {
			ContigNode v = iter->first;
			const DistanceEst& ep = iter->second;
			if (repeats.count(v.contigIndex()) > 0)
				continue;
			map<ContigNode, int>::iterator dmIter
				= distanceMap.find(v);
			assert(dmIter != distanceMap.end());
			int actualDistance = dmIter->second;
			int diff = actualDistance - ep.distance;
			sumDiff += abs(diff);
		}

		if (sumDiff < minDiff) {
			minDiff = sumDiff;
			bestSol = solIter;
		}

		vout << *solIter
			<< " length: " << calculatePathLength(g, origin, *solIter)
			<< " sumdiff: " << sumDiff << '\n';
	}

	/** Lock the debugging stream. */
	static pthread_mutex_t coutMutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_lock(&coutMutex);
	stats.totalAttempted++;
	g_minNumPairs = min(g_minNumPairs, minNumPairs);

	// Classify the outcome. Exactly one branch runs, so the order of
	// the tests decides which counter is incremented.
	if (tooComplex) {
		stats.tooComplex++;
	} else if (tooManySolutions) {
		stats.tooManySolutions++;
	} else if (numPossiblePaths == 0) {
		stats.noPossiblePaths++;
	} else if (solutions.empty()) {
		stats.noValidPaths++;
	} else if (repeats.count(er.refID) > 0) {
		vout << "Repeat: " << get(vertex_name, g, origin) << '\n';
		stats.repeat++;
	} else if (solutions.size() > 1) {
		// Several valid solutions: combine them into one ambiguous
		// path, which is appended to out only if opt::scaffold is
		// set.
		ContigPath path
			= constructAmbiguousPath(g, origin, solutions);
		if (!path.empty()) {
			if (opt::extend)
				extend(g, path.back(), back_inserter(path));
			vout << path << '\n';
			if (opt::scaffold) {
				out.insert(out.end(), path.begin(), path.end());
				g_minNumPairsUsed
					= min(g_minNumPairsUsed, minNumPairs);
			}
		}
		stats.multiEnd++;
	} else {
		// Exactly one valid solution: append it to out.
		assert(solutions.size() == 1);
		assert(bestSol != solutions.end());
		ContigPath& path = *bestSol;
		if (opt::verbose > 1)
			printDistanceMap(vout, g, origin, path);
		if (opt::extend)
			extend(g, path.back(), back_inserter(path));
		out.insert(out.end(), path.begin(), path.end());
		stats.uniqueEnd++;
		g_minNumPairsUsed = min(g_minNumPairsUsed, minNumPairs);
	}
	// Print the whole buffered log for this call while the mutex is
	// still held.
	cout << vout_ss.str();
	if (!out.empty())
		assert(!out.back().ambiguous());
	pthread_mutex_unlock(&coutMutex);
}
Exemple #10
0
/** Build one path that agrees with all the given paths.
 * The result is the prefix shared by every path, followed (only
 * when the paths also share a non-empty suffix) by a single 'N' node
 * standing for the part on which they differ and then that shared
 * suffix.
 * @param g the graph the paths belong to
 * @param origin the node from which the paths start
 * @param paths the paths to reconcile; must not be empty
 * @return the combined path
 */
static ContigPath constructAmbiguousPath(const Graph &g,
		const ContigNode& origin, const ContigPaths& paths)
{
	assert(!paths.empty());
	typedef ContigPaths::const_iterator PathIt;

	// The shared prefix and suffix cannot exceed the shortest path.
	const ContigPath& firstSol = paths.front();
	size_t shortest = firstSol.size();
	for (PathIt p = paths.begin() + 1; p != paths.end(); ++p) {
		if (p->size() < shortest)
			shortest = p->size();
	}

	// Grow the common prefix one node at a time, from the front.
	ContigPath prefix;
	size_t longestPrefix = 0;
	while (longestPrefix < shortest) {
		const ContigNode& candidate = firstSol[longestPrefix];
		bool shared = true;
		for (PathIt p = paths.begin(); p != paths.end(); ++p) {
			if ((*p)[longestPrefix] != candidate) {
				shared = false;
				break;
			}
		}
		if (!shared)
			break;
		prefix.push_back(candidate);
		++longestPrefix;
	}

	// Grow the common suffix from the back, without overlapping the
	// prefix on the shortest path. Nodes are collected back to front.
	ContigPath reversedSuffix;
	size_t longestSuffix = 0;
	while (longestSuffix < shortest - longestPrefix) {
		const ContigNode& candidate
			= firstSol[firstSol.size() - longestSuffix - 1];
		bool shared = true;
		for (PathIt p = paths.begin(); p != paths.end(); ++p) {
			if ((*p)[p->size() - longestSuffix - 1] != candidate) {
				shared = false;
				break;
			}
		}
		if (!shared)
			break;
		reversedSuffix.push_back(candidate);
		++longestSuffix;
	}

	ContigPath out;
	out.reserve(prefix.size() + 1 + reversedSuffix.size());
	out.insert(out.end(), prefix.begin(), prefix.end());
	if (longestSuffix == 0)
		return out;

	// Size the gap from the longest path, measured between the
	// shared prefix and the shared suffix.
	const ContigPath& longestPath(
			*max_element(paths.begin(), paths.end(),
				ComparePathLength(g, origin)));
	unsigned length = calculatePathLength(g, origin, longestPath,
			longestPrefix, longestSuffix);

	// Account for the overlap on the right: the distance between the
	// first suffix node and the node before it, which is the origin
	// when the suffix covers the whole longest path.
	const ContigNode& beforeSuffix
		= longestSuffix == longestPath.size() ? origin
		: *(longestPath.rbegin() + longestSuffix);
	const ContigNode& suffixStart
		= *(longestPath.rbegin() + longestSuffix - 1);
	int dist = length + getDistance(g, beforeSuffix, suffixStart);

	// Add k-1 because it is the convention.
	int numN = dist + opt::k - 1;
	assert(numN > 0);

	out.push_back(ContigNode(numN, 'N'));
	out.insert(out.end(),
			reversedSuffix.rbegin(), reversedSuffix.rend());
	return out;
}
Exemple #11
0
/** Try to fill the specified gap with a consensus of the paths that
 * span it.
 * A constrained search is run from the source of the gap to its
 * destination. A consensus is computed only when two or more
 * solutions are found and the search was neither too complex nor
 * produced more than opt::numBranches solutions.
 * @param g the graph to search
 * @param apConstraint the source, destination and size of the gap
 * @param seen [in,out] passed to markSeen together with the
 * solutions when a consensus is found
 * @param outFasta passed to align
 * @return the consensus path, or an empty path in every other case
 */
static ContigPath fillGap(const Graph& g,
		const AmbPathConstraint& apConstraint,
		vector<bool>& seen,
		ofstream& outFasta)
{
	if (opt::verbose > 1) {
		cerr << "\n* "
			<< get(vertex_name, g, apConstraint.source) << ' '
			<< apConstraint.dist << "N "
			<< get(vertex_name, g, apConstraint.dest) << '\n';
	}

	// The only constraint is the destination, at the gap size plus
	// the permitted distance error.
	Constraints constraints;
	constraints.push_back(Constraint(apConstraint.dest,
				apConstraint.dist + opt::distanceError));

	ContigPaths solutions;
	unsigned numVisited = 0;
	constrainedSearch(g, apConstraint.source,
			constraints, solutions, numVisited);

	// Put the source node at the front of every solution.
	for (ContigPaths::iterator sol = solutions.begin();
			sol != solutions.end(); ++sol)
		sol->insert(sol->begin(), apConstraint.source);

	ContigPath consensus;

	if (numVisited >= opt::maxCost) {
		stats.tooComplex++;
		if (opt::verbose > 1)
			cerr << solutions.size() << " paths (too complex)\n";
		return consensus;
	}

	if (solutions.size() > opt::numBranches) {
		stats.numTooManySolutions++;
		if (opt::verbose > 1)
			cerr << solutions.size() << " paths (too many)\n";
		return consensus;
	}

	if (solutions.empty()) {
		stats.numNoSolutions++;
		if (opt::verbose > 1)
			cerr << "no paths\n";
		return consensus;
	}

	if (solutions.size() == 1) {
		// A single path is counted as merged, but no consensus path
		// is returned for it.
		if (opt::verbose > 1)
			cerr << "1 path\n" << solutions.front() << '\n';
		stats.numMerged++;
		return consensus;
	}

	assert(solutions.size() > 1);
	if (opt::verbose > 2) {
		copy(solutions.begin(), solutions.end(),
				ostream_iterator<ContigPath>(cerr, "\n"));
	} else if (opt::verbose > 1) {
		cerr << solutions.size() << " paths\n";
	}

	consensus = align(g, solutions, outFasta);
	if (consensus.empty()) {
		stats.notMerged++;
		return consensus;
	}

	stats.numMerged++;
	// Mark contigs that are used in a consensus.
	markSeen(seen, solutions, true);
	if (opt::verbose > 1)
		cerr << consensus << '\n';
	return consensus;
}
Exemple #12
0
/* Resolve an ambiguous region using pairwise alignment
 * (Needleman-Wunsch). 'solutions' holds exactly two paths that share
 * their first node (the source contig) and their last node (the dest
 * contig).
 *
 * Returns one of the two input paths when their interior sequences
 * are identical, an empty path when the length ratio or the identity
 * is below opt::identity, and otherwise the path
 * source -> new consensus contig -> dest, where the new contig is
 * produced by outputNewContig, which also receives 'out'.
 */
static ContigPath alignPair(const Graph& g,
		const ContigPaths& solutions, ofstream& out)
{
	assert(solutions.size() == 2);
	assert(solutions[0].size() > 1);
	assert(solutions[1].size() > 1);
	assert(solutions[0].front() == solutions[1].front());
	assert(solutions[0].back() == solutions[1].back());

	// Drop the shared endpoints, keeping only the two interiors.
	ContigPath fstSol(solutions[0].begin()+1, solutions[0].end()-1);
	ContigPath sndSol(solutions[1].begin()+1, solutions[1].end()-1);

	const bool oneInteriorEmpty = fstSol.empty() || sndSol.empty();
	if (oneInteriorEmpty) {
		// This entire sequence may be deleted.
		const ContigPath& sol(fstSol.empty() ? sndSol : fstSol);
		assert(!sol.empty());
		Sequence consensus(mergePath(g, sol));
		assert(consensus.size() > opt::k - 1);

		// Lower-case everything after the first k-1 characters.
		string::iterator first = consensus.begin() + opt::k - 1;
		transform(first, consensus.end(), first, ::tolower);

		// Only the first k-1 characters count as matching.
		unsigned match = opt::k - 1;
		float identity = static_cast<float>(match) / consensus.size();
		const bool tooLow = identity < opt::identity;
		if (opt::verbose > 2)
			cerr << consensus << '\n';
		if (opt::verbose > 1)
			cerr << identity << (tooLow ? " (too low)\n" : "\n");
		if (tooLow)
			return ContigPath();

		unsigned coverage = calculatePathProperties(g, sol).coverage;
		ContigNode u = outputNewContig(g,
				solutions, 1, 1, consensus, coverage, out);
		ContigPath path;
		path.push_back(solutions.front().front());
		path.push_back(u);
		path.push_back(solutions.front().back());
		return path;
	}

	Sequence fstPathContig(mergePath(g, fstSol));
	Sequence sndPathContig(mergePath(g, sndSol));

	if (fstPathContig == sndPathContig) {
		// The two paths spell the same sequence.
		if (fstSol.size() != sndSol.size()) {
			// The paths are different lengths.
			cerr << PROGRAM ": warning: "
				"Two paths have identical sequence, which may be "
				"caused by a transitive edge in the overlap graph.\n"
				<< '\t' << fstSol << '\n'
				<< '\t' << sndSol << '\n';
			// Return the path with more nodes; the second on a tie.
			if (fstSol.size() > sndSol.size())
				return solutions[0];
			return solutions[1];
		}

		// A perfect match must be caused by palindrome: the paths
		// differ at one node, which is its own reverse complement.
		typedef ContigPath::const_iterator It;
		pair<It, It> it = mismatch(
				fstSol.begin(), fstSol.end(), sndSol.begin());
		assert(it.first != fstSol.end());
		assert(it.second != sndSol.end());
		assert(*it.first
				== get(vertex_complement, g, *it.second));
		assert(equal(it.first+1, It(fstSol.end()), it.second+1));
		if (opt::verbose > 1)
			cerr << "Palindrome: "
				<< get(vertex_contig_name, g, *it.first) << '\n';
		return solutions[0];
	}

	// Skip the alignment when the lengths already differ too much.
	unsigned minLength = min(
			fstPathContig.length(), sndPathContig.length());
	unsigned maxLength = max(
			fstPathContig.length(), sndPathContig.length());
	float lengthRatio = static_cast<float>(minLength) / maxLength;
	if (lengthRatio < opt::identity) {
		if (opt::verbose > 1)
			cerr << minLength << '\t' << maxLength
				<< '\t' << lengthRatio << "\t(different length)\n";
		return ContigPath();
	}

	NWAlignment align;
	unsigned match = alignGlobal(fstPathContig, sndPathContig,
			align);
	float identity = static_cast<float>(match) / align.size();
	const bool tooLow = identity < opt::identity;
	if (opt::verbose > 2)
		cerr << align;
	if (opt::verbose > 1)
		cerr << identity << (tooLow ? " (too low)\n" : "\n");
	if (tooLow)
		return ContigPath();

	// The consensus contig gets the combined coverage of both paths.
	unsigned coverage = calculatePathProperties(g, fstSol).coverage
		+ calculatePathProperties(g, sndSol).coverage;
	ContigNode u = outputNewContig(g, solutions, 1, 1,
			align.consensus(), coverage, out);
	ContigPath path;
	path.push_back(solutions.front().front());
	path.push_back(u);
	path.push_back(solutions.front().back());
	return path;
}