Example #1
0
//
// Print the non-branching paths ("scaffolds") of a repeat graph in FASTA format.
//
// Positional arguments:
//   argv[1]  repeat graph file, read with RepeatGraph::ReadGraph.
//   argv[2]  FASTA file holding the vertex sequences.
//   argv[3]  output file name, or an output directory name when -separate is given.
// Options:
//   -separate       write every scaffold to its own file "<argv[3]>/<index>.fasta".
//   -repeats file   write the vertices that are not part of any printed path to
//                   'file' rather than to the scaffold stream.
//
// Graph vertex v uses sequence v/2 of the FASTA file: even vertices print the
// sequence as read, odd vertices print its reverse complement.
//
int main(int argc, char* argv[]) {

	if (argc < 4) {
		PrintUsage();
		exit(0);
	}

	string rgFileName, vertexSeqFileName, scaffoldDirName;

	rgFileName         = argv[1];
	vertexSeqFileName  = argv[2];
	scaffoldDirName    = argv[3];

	string repeatFileName = "";
	bool printRepeatsSeparately = false;
	int argi = 4;
	bool printSeparate=false;
	while (argi < argc) {
		if (strcmp(argv[argi], "-separate") == 0) {
			printSeparate=true;
		}
		else if (strcmp(argv[argi], "-repeats") == 0) {
			//
			// This option consumes the next argument.  Without this check a
			// trailing "-repeats" would build a string from argv[argc], which
			// is a null pointer.
			//
			if (argi + 1 >= argc) {
				cout << "bad option: -repeats requires a file name" << endl;
				PrintUsage();
				exit(1);
			}
			printRepeatsSeparately = true;
			repeatFileName = argv[++argi];
		}
		else {
			cout << "bad option: " << argv[argi] << endl;
			PrintUsage();
			exit(1);
		}
		++argi;
	}

	FASTAReader vertexSequenceReader;
	vertexSequenceReader.Init(vertexSeqFileName);

	//
	// Input necessary data
	//
	vector<FASTASequence> vertexSequences;
	vertexSequenceReader.ReadAllSequences(vertexSequences);
	RepeatGraph<string> rg;
	rg.ReadGraph(rgFileName);

	//
	// Precompute the reverse complement of every vertex sequence; it is
	// printed for odd-numbered graph vertices.
	//
	vector<FASTASequence> vertexRCSequences;
	VectorIndex vertexIndex;
	vertexRCSequences.resize(vertexSequences.size());
	for (vertexIndex = 0; vertexIndex < vertexSequences.size(); vertexIndex++ ){
		vertexSequences[vertexIndex].MakeRC(vertexRCSequences[vertexIndex]);
	}

	int scaffoldIndex = 0;
	ofstream scaffoldOut;

	if (printSeparate==false) {
		// scaffold dir name is really a file name here.
		CrucialOpen(scaffoldDirName, scaffoldOut, std::ios::out);
	}
	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){
		rg.vertices[vertexIndex].traversed = false;
	}

	//
	// Set up flow for calling multiplicity.
	//
	/*
		Test all this out later.
	AssignMinimumFlowToEdges(rg, 2);
	AssignVertexFlowBalance(rg);
	BalanceKirchhoffFlow(rg);

	UInt edgeIndex;
	for (edgeIndex = 0; edgeIndex < rg.edges.size(); edgeIndex++) {
		if (rg.edges[edgeIndex].flow > 1) {
			cout << edgeIndex << " " << rg.edges[edgeIndex].flow << endl;
		}
	}
	*/

	int numPrintedVertices = 0;
	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){
		//
		// Look to see if this vertex is a branching vertex.
		//
		if ((rg.vertices[vertexIndex].inEdges.size() != 1 or
				 rg.vertices[vertexIndex].outEdges.size() != 1) and
				rg.vertices[vertexIndex].traversed == false) {

			//
			// This is a branching vertex.  Print all paths from this vertex, but not the vertex
			// itself if it appears repetitive.
			//
			VectorIndex outEdgeIndex;
			for (outEdgeIndex = 0; outEdgeIndex < rg.vertices[vertexIndex].outEdges.size(); outEdgeIndex++ ){

				cout << " printing scaffold: " << scaffoldIndex << endl;
				if (printSeparate) {
					stringstream scaffoldFileNameStrm;
					scaffoldFileNameStrm << scaffoldDirName << "/" << scaffoldIndex << ".fasta";
					string scaffoldFileName = scaffoldFileNameStrm.str();
					CrucialOpen(scaffoldFileName, scaffoldOut, std::ios::out);
				}
				++scaffoldIndex;

				//
				// Store the nonbranching path in a list so that it may be quickly processed.
				//
				bool pathIsPrinted = false;
				vector<VectorIndex> path;
				// A pure source (no in edges, one out edge) is part of its own path.
				if (rg.vertices[vertexIndex].InDegree() == 0 and rg.vertices[vertexIndex].OutDegree() == 1) {
					path.push_back(vertexIndex);
				}
				VectorIndex pathVertex = rg.edges[rg.vertices[vertexIndex].outEdges[outEdgeIndex]].dest;
				while(rg.vertices[pathVertex].inEdges.size() == 1 and
							rg.vertices[pathVertex].outEdges.size() == 1) {
					if (rg.vertices[pathVertex].traversed == true) {
						// This path was already printed; skip the output below.
						pathIsPrinted = true;
						break;
					}
					path.push_back(pathVertex);
					pathVertex = rg.edges[rg.vertices[pathVertex].outEdges[0]].dest;
				}
				//
				// Look to see if this is the end of a simple path, if so, add it to the scaffold.
				//
				// NOTE(review): pathVertex is reset to the first vertex after the
				// branch, so only a sink directly adjacent to the branching vertex
				// is appended; a sink at the end of a longer chain is not.  Kept
				// as is -- confirm whether the reset is intended.
				//
				pathVertex = rg.edges[rg.vertices[vertexIndex].outEdges[outEdgeIndex]].dest;
				if (rg.vertices[pathVertex].OutDegree() == 0 and rg.vertices[pathVertex].InDegree() == 1) {
					path.push_back(pathVertex);
				}
				//
				// Determine the sequences in the scaffold and the total scaffold length.
				//
				if (pathIsPrinted == false) {
					VectorIndex p;
					DNALength scaffoldLength = 0;
					for (p = 0; p < path.size(); p++ ){
						scaffoldLength += vertexSequences[path[p]/2].length;
						rg.vertices[path[p]].traversed = true;
						//					rg.vertices[2*(path[p]/2)+ !(path[p]%2)].traversed = true;
						++numPrintedVertices;
					}
					cout << "path is of size " << path.size() << " length " << scaffoldLength << endl;
					if (!printSeparate) {
						// One header for the whole scaffold when everything goes to one file.
						scaffoldOut << ">" << scaffoldIndex << " " << path.size() << " " << scaffoldLength << endl;
					}
					for (p = 0; p < path.size(); p++) {
						if (printSeparate) {
							// One header per vertex when each scaffold has its own file.
							scaffoldOut << ">" << p << " " << path[p]/2 << " " << vertexSequences[path[p]/2].length << endl;
						}
						if (path[p]%2 == 0) {
							((DNASequence)vertexSequences[path[p]/2]).PrintSeq(scaffoldOut);
						}
						else {
							((DNASequence)vertexRCSequences[path[p]/2]).PrintSeq(scaffoldOut);
						}
						// Mark the vertex and its paired (2k <-> 2k+1) vertex as traversed.
						rg.vertices[path[p]].traversed = true;
						rg.vertices[2*(path[p]/2) + !(path[p]%2)].traversed = true;
					}
				}
				//
				// Close the per-scaffold file even when nothing was written to it.
				// Leaving it open makes the next open() on this stream fail,
				// because an ofstream cannot be reopened while it is still open.
				//
				if (printSeparate) {
					scaffoldOut.close();
					scaffoldOut.clear();
				}
			}
		}
	}

	//
	// Choose where the remaining (untraversed) vertices are written.
	//
	// NOTE(review): with -separate and without -repeats, scaffoldOut is not
	// open at this point, so the output of the loop below is discarded.
	//
	ofstream* outPtr;
	ofstream repeatOut;
	if (printRepeatsSeparately) {
		CrucialOpen(repeatFileName, repeatOut, std::ios::out);
		outPtr = &repeatOut;
	}
	else {
		outPtr = &scaffoldOut;
	}

	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){
		if (rg.vertices[vertexIndex].traversed == false) {
			//
			// Print this vertex sequence only.  It is repetitive, or isolated.
			//
			*outPtr << ">" << scaffoldIndex << endl;
			++scaffoldIndex;
			if (vertexIndex%2 == 0) {
				((DNASequence)vertexSequences[vertexIndex/2]).PrintSeq(*outPtr);
			}
			else {
				((DNASequence)vertexRCSequences[vertexIndex/2]).PrintSeq(*outPtr);
			}
			// Marking the paired vertex as well keeps each sequence from being printed twice.
			rg.vertices[vertexIndex].traversed = true;
			rg.vertices[2*(vertexIndex/2)+ !(vertexIndex%2)].traversed = true;
		}
	}

	cout << "printed: " << numPrintedVertices << " of " << rg.vertices.size() << endl;
	return 0;
}
//
// Remove every edge whose 'count' is below a minimum coverage from a repeat
// graph and write the resulting graph.
//
// Arguments:
//   argv[1]  input repeat graph file.
//   argv[2]  minimum coverage; parsed with atoi, so non-numeric text becomes 0.
//   argv[3]  output repeat graph file.
//
// Returns 0 on success and also when the input graph has no edges (a warning
// is printed and no graph is written); exits with 1 on a usage error.
//
int main(int argc, char* argv[]) {
	string rgInName, rgOutName;
	int minCoverage;

	if (argc < 4) {
		cout << "usage: removeTransitiveOverlaps in.rg minCoverage out.rg" << endl;
		exit(1);
	}

	rgInName    = argv[1];
	minCoverage = atoi(argv[2]);
	rgOutName   = argv[3];

	RepeatGraph<string> rg;
	rg.ReadGraph(rgInName);

	//
	// Open the output only after the input has been read.  Opening it first
	// would presumably truncate the file, which destroys the input when the
	// same path is given for both.
	//
	ofstream rgOut;
	CrucialOpen(rgOutName, rgOut, std::ios::out);

	if (rg.edges.size() == 0) {
		cout << "LIKELY INVALID GRAPH. There are no edges." << endl;
		return 0;
	}

	//
	// At first, any edge that exists is connected to a vertex. This
	// will change as low coverage edges are deleted and replaced by
	// high coverage edges from the end of the array.
	//
	VectorIndex edgeIndex;
	for (edgeIndex = 0; edgeIndex < rg.edges.size(); edgeIndex++) {
		rg.edges[edgeIndex].connected = true;
	}

	//
	// Compact the edge array in place.  Edges in [0, edgeIndex) are kept,
	// edges in [numEdges, end) have been discarded, and everything in
	// between is still to be examined.
	//
	VectorIndex numEdges = rg.edges.size();
	for (edgeIndex = 0; edgeIndex < numEdges; ) {
		if (rg.edges[edgeIndex].count >= minCoverage) {
			// This edge is fine.
			edgeIndex++;
		}
		else {
			// This edge needs to be deleted. Find the first edge at the end that
			// will not be deleted anyway, and move it here.

			while (numEdges > edgeIndex and rg.edges[numEdges-1].count < minCoverage) {
				UnlinkDirected(rg.vertices, rg.edges, numEdges-1);
				numEdges--;
			}

			//
			// If exhausted all edges, just break since all are deleted.
			// The edge at edgeIndex was itself unlinked by the loop above.
			//
			if (numEdges == edgeIndex) {
				break;
			}

			//
			// Get rid of this low coverage edge.
			UnlinkDirected(rg.vertices, rg.edges, edgeIndex);

			//
			// Pack in one from a higher coverage.
			rg.edges[edgeIndex] = rg.edges[numEdges-1];

			//
			// Update the connecting vertex so that it refers to the new
			// position of the moved edge.
			UpdateOutEdge(rg.vertices, rg.edges, numEdges-1, edgeIndex);
			UpdateInEdge(rg.vertices, rg.edges, numEdges-1, edgeIndex);
			--numEdges;

			// The edge just moved in is already known to have enough coverage.
			++edgeIndex;
		}
	}
	rg.edges.resize(numEdges);
	rg.WriteGraph(rgOut);
	return 0;
}
int main(int argc, char* argv[]) {
	string rgInName, rgOutName;
	int minPathLength;
	string vertexSequenceFileName;
	if (argc < 5) {
		cout << "usage: trimShortEnds in.rg  vertexSequences minPathLength out.rg" << endl;
		exit(1);
	}

	rgInName      = argv[1];
	vertexSequenceFileName = argv[2];
	minPathLength = atoi(argv[3]);
	rgOutName     = argv[4];

	ofstream rgOut;
	CrucialOpen(rgOutName, rgOut, std::ios::out);
	FASTAReader vertexSequenceReader;
	vertexSequenceReader.Init(vertexSequenceFileName);

	RepeatGraph<string> rg;
	vector<FASTASequence> vertexSequences;
	rg.ReadGraph(rgInName);
	vertexSequenceReader.ReadAllSequences(vertexSequences);

	VectorIndex vertexIndex;
	VectorIndex outEdgeIndex;
	VectorIndex edgeIndex;
	
	if (rg.edges.size() == 0) {
		cout << "LIKELY INVALID GRAPH. There are no edges." << endl;
		return 0;
	}
	//
	// At first, any edge that exists is connected to a vertex. This
	// will change as low coverage edges are deleted and replaced by
	// high coverage edges from the end of the array.
	//
	for (edgeIndex = 0; edgeIndex < rg.edges.size(); edgeIndex++) {
		rg.edges[edgeIndex].connected = true;
	}
	set<std::pair<VectorIndex, VectorIndex> > srcDestToRemove;
	
	for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++) {
		if (rg.vertices[vertexIndex].inEdges.size() == 0 and
				rg.vertices[vertexIndex].outEdges.size() == 1) {
			//
			// This is a source.  Traverse this until a branching vertex or the end is found.
			//
			vector<VectorIndex> path;
			path.push_back(vertexIndex);
			int pathLength = 0;
			VectorIndex pathVertex;
			VectorIndex pathEdge;
			pathEdge = rg.vertices[vertexIndex].outEdges[0];
			pathVertex = rg.edges[pathEdge].dest;
			while (rg.vertices[pathVertex].inEdges.size() == 1 and
						 rg.vertices[pathVertex].outEdges.size() == 1) {
				path.push_back(pathVertex);
				pathEdge   =  rg.vertices[pathVertex].outEdges[0];
				pathVertex =  rg.edges[pathEdge].dest;
				pathLength += vertexSequences[pathVertex/2].length;
			}
			pathLength += vertexSequences[pathVertex/2].length;
			path.push_back(pathVertex);
			if (pathLength < minPathLength and path.size() < 3) {
				//
				// Remove this path, it is too short.
				// Also remove the complement.
				//
				cout << "trimming path of " << path.size() << " is of sequence length " << pathLength << endl;

				VectorIndex pathIndex;
				for (pathIndex = 0; pathIndex < path.size() - 1; pathIndex++) {
					srcDestToRemove.insert(pair<VectorIndex, VectorIndex>(path[pathIndex], path[pathIndex+1]));
					srcDestToRemove.insert(pair<VectorIndex, VectorIndex>(2*(path[pathIndex+1]/2) + !(path[pathIndex+1]%2),
																																2*(path[pathIndex]/2) + !(path[pathIndex]%2)));
				}
			}
		}
	}

	MarkEdgePairsForRemoval(srcDestToRemove, rg.vertices, rg.edges);
	RemoveUnconnectedEdges(rg.vertices, rg.edges);

	rg.WriteGraph(rgOut);
	return 0;
}