Пример #1
0
// Load the graph in opt::asqgFile, clean it, and write the part of it that
// addNeighborsToSubgraph collects around vertex opt::rootID (limited by
// opt::span) to opt::outFile. If the root vertex does not exist, a message is
// printed and nothing is written.
void subgraph()
{
    StringGraph* pGraph = SGUtil::loadASQG(opt::asqgFile, 0, true, opt::maxEdges);
    pGraph->printMemSize();

    // Keep running the containment visitor until the graph reports
    // that no containment is left.
    SGContainRemoveVisitor containVisit;
    std::cout << "Removing contained vertices\n";
    while(pGraph->hasContainment())
        pGraph->visit(containVisit);

    // Optional pass: drop overlap edges that are not maximal
    if (opt::maxOverlap)
    {
        SGMaximalOverlapVisitor moVisit(-1);
        std::cout << "Removing non-maximal overlap edges from graph\n";
        pGraph->visit(moVisit);
    }

    // The output graph inherits the parameters of the (cleaned) input graph
    StringGraph* pSubgraph = new StringGraph;
    pSubgraph->setContainmentFlag(pGraph->hasContainment());
    pSubgraph->setTransitiveFlag(pGraph->hasTransitive());
    pSubgraph->setMinOverlap(pGraph->getMinOverlap());
    pSubgraph->setErrorRate(pGraph->getErrorRate());

    // Look up the vertex the subgraph is grown from
    Vertex* pRootVertex = pGraph->getVertex(opt::rootID);
    if(pRootVertex != NULL)
    {
        // Seed the subgraph with the root and colour it black before
        // expanding to its neighbours.
        copyVertexToSubgraph(pSubgraph, pRootVertex);
        pRootVertex->setColor(GC_BLACK);

        addNeighborsToSubgraph(pRootVertex, pSubgraph, opt::span);

        // Emit the result
        pSubgraph->writeASQG(opt::outFile);
    }
    else
    {
        std::cout << "Vertex " << opt::rootID << " not found in the graph.\n";
    }

    // Both graphs are owned by this function
    delete pSubgraph;
    delete pGraph;
}
Пример #2
0
// Parse the ASQG file `filename` into a newly allocated StringGraph.
//
// Header records are only accepted before any vertex record, and edge records
// only once at least one vertex record has been read; a record that arrives
// out of order prints an error and terminates the program.
// Edges are created only when the overlap's getMinOverlapLength() is at least
// minOverlap; allowContainments is forwarded to
// SGAlgorithms::createEdgesFromOverlap.
// After parsing, the duplicate-edge and graph-stats visitors are run.
// The returned graph is allocated with new and must be deleted by the caller.
StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigned int minOverlap, 
                              bool allowContainments)
{
    StringGraph* pGraph = new StringGraph;
    std::istream* pReader = createReader(filename);

    // section: 0 = still in the header, 1 = reading vertices, 2 = reading edges
    int section = 0;
    // Zero-based index of the record being processed (used in error messages)
    int lineIndex = 0;
    std::string record;
    while(getline(*pReader, record))
    {
        const ASQG::RecordType rt = ASQG::getRecordType(record);
        if(rt == ASQG::RT_HEADER)
        {
            // A header is only legal before the first vertex record
            if(section != 0)
            {
                std::cerr << "Error: Unexpected header record found at line " << lineIndex << "\n";
                exit(EXIT_FAILURE);
            }

            ASQG::HeaderRecord headerRecord(record);

            // Minimum overlap: falls back to 0 when the tag is absent
            const SQG::IntTag& overlapTag = headerRecord.getOverlapTag();
            if(!overlapTag.isInitialized())
                pGraph->setMinOverlap(0);
            else
                pGraph->setMinOverlap(overlapTag.get());

            // Error rate: only set when the tag is present
            const SQG::FloatTag& errorRateTag = headerRecord.getErrorRateTag();
            if(errorRateTag.isInitialized())
                pGraph->setErrorRate(errorRateTag.get());

            // Containment flag: without the tag, conservatively assume
            // that containments are present
            const SQG::IntTag& containmentTag = headerRecord.getContainmentTag();
            if(!containmentTag.isInitialized())
                pGraph->setContainmentFlag(true);
            else
                pGraph->setContainmentFlag(containmentTag.get());

            // Transitive flag: without the tag, warn and set the flag to true
            const SQG::IntTag& transitiveTag = headerRecord.getTransitiveTag();
            if(transitiveTag.isInitialized())
            {
                pGraph->setTransitiveFlag(transitiveTag.get());
            }
            else
            {
                std::cerr << "Warning: ASQG does not have transitive tag\n";
                pGraph->setTransitiveFlag(true);
            }
        }
        else if(rt == ASQG::RT_VERTEX)
        {
            // The first vertex record ends the header section; vertices
            // are rejected once the edge section has started.
            if(section == 0)
            {
                section = 1;
            }
            else if(section != 1)
            {
                std::cerr << "Error: Unexpected vertex record found at line " << lineIndex << "\n";
                exit(EXIT_FAILURE);
            }

            ASQG::VertexRecord vertexRecord(record);
            const SQG::IntTag& ssTag = vertexRecord.getSubstringTag();

            Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(vertexRecord.getID(), vertexRecord.getSeq());

            // A substring tag equal to 1 marks the vertex as contained in
            // some other vertex; the graph-level flag is raised as well.
            const bool isSubstring = ssTag.isInitialized() && ssTag.get() == 1;
            if(isSubstring)
            {
                pVertex->setContained(true);
                pGraph->setContainmentFlag(true);
            }
            pGraph->addVertex(pVertex);
        }
        else if(rt == ASQG::RT_EDGE)
        {
            // The first edge record ends the vertex section; an edge seen
            // while still in the header section is an error.
            if(section == 1)
            {
                section = 2;
            }
            else if(section != 2)
            {
                std::cerr << "Error: Unexpected edge record found at line " << lineIndex << "\n";
                exit(EXIT_FAILURE);
            }

            ASQG::EdgeRecord edgeRecord(record);
            const Overlap& ovr = edgeRecord.getOverlap();

            // Only overlaps of at least minOverlap become edges
            const bool longEnough = ovr.match.getMinOverlapLength() >= static_cast<int>(minOverlap);
            if(longEnough)
                SGAlgorithms::createEdgesFromOverlap(pGraph, ovr, allowContainments);
        }
        // Any other record type is ignored.

        ++lineIndex;
    }

    // Drop duplicated edges
    SGDuplicateVisitor dupVisit;
    pGraph->visit(dupVisit);

    // Run the graph statistics visitor on the loaded graph
    SGGraphStatsVisitor statsVisit;
    pGraph->visit(statsVisit);

    // Disabled: removal of identical and substring vertices at load time.
    // The original note says this is much cheaper than SGContainRemove
    // because no remodelling needs to occur.
    /*
    SGIdenticalRemoveVisitor irv;
    pGraph->visit(irv);

    while(pGraph->hasContainment())
    {
        SGContainRemoveVisitor crv;
        pGraph->visit(crv);
    }
    */

    delete pReader;
    return pGraph;
}
int assemble()
{
	StringGraph* pGraph;
	#pragma omp parallel
	{
		#pragma omp single nowait
		{
			std::cout << "\n[ Loading string graph: " << opt::asqgFile <<  " ]\n";
			pGraph=SGUtil::loadASQGVertex(opt::asqgFile, opt::minOverlap, true, opt::maxEdges);
		}
		#pragma omp single nowait
		{
			std::cout << "[ Loading BWT ]\n";
			opt::pBWT = new BWT(opt::prefix + BWT_EXT, BWT::DEFAULT_SAMPLE_RATE_SMALL);
		}
		#pragma omp single nowait
		{
			std::cout << "[ Loading RBWT ]\n";
			opt::pRBWT = new BWT(opt::prefix + RBWT_EXT, BWT::DEFAULT_SAMPLE_RATE_SMALL);
		}
		#pragma omp single nowait
		{
			std::cout << "[ Loading SAI ]\n";
			opt::pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);
		}
	}
    opt::indices.pBWT = opt::pBWT;
    opt::indices.pRBWT = opt::pRBWT;
    opt::indices.pSSA = opt::pSSA;
	
	pGraph=SGUtil::loadASQGEdge(opt::asqgFile, opt::minOverlap, true, opt::maxEdges, pGraph);

	// if(opt::bExact)
	pGraph->setExactMode(opt::bExact);
	//pGraph->printMemSize();

	// // Pre-assembly graph stats
	SGGraphStatsVisitor statsVisit;
	std::cout << "[Stats] Input graph:\n";
	pGraph->visitP(statsVisit);
	
	int phase = 0 ;

	// Remove containments from the graph
	std::cout << "Removing contained vertices from graph\n";
	SGContainRemoveVisitor containVisit;
	while(pGraph->hasContainment())
		pGraph->visit(containVisit);

	/*---Remove Transitive Edges---*/
	std::cout << "Removing transitive edges\n";
	SGTransitiveReductionVisitor trVisit;
	pGraph->visit(trVisit);
	/*---Remove Transitive Edges---*/
	// getchar();

	// Compact together unbranched chains of vertices
	std::cout << "Start to simplify unipaths ...\n";
    pGraph->simplify();
	std::cout << "[Stats] Simplified graph:\n";
	pGraph->visitP(statsVisit);


	/**********************Compute overlap raio and diff (for debug)**************
	std::ofstream ssol  ("simpleOverlapLength.histo", std::ofstream::out);
	std::map<size_t,int> simpleStats = pGraph->getCountMap() ;
	//Stats simple overlap length
	std::cout << "< Compute Stats of Overlap Length and Ratios>" << std::endl;
	//ssol << "Length\tCount" << std::endl;
	for (std::map<size_t,int> ::iterator iter=simpleStats.begin(); iter!=simpleStats.end(); ++iter)
		ssol << iter->first << "\t" << iter->second << std::endl;
	ssol.close();

	std::ofstream orStats  ("overlapRaito.stats", std::ofstream::out);
	std::vector < overlapRatioInfo > overlapRatioStats = pGraph->getOverlapRatioStats();
	for (std::vector < overlapRatioInfo >::iterator iter=overlapRatioStats.begin(); iter!=overlapRatioStats.end(); ++iter)
		orStats << (*iter).overlapLength  << "\t" << (*iter).originalLength << "\t"
		        << std::setiosflags(std::ios::fixed) << std::setprecision(3)
				<< (double)(*iter).overlapLength/(*iter).originalLength << "\t" << (*iter).ratioDiff << std::endl;

	orStats.close();
	**********************Compute overlap raio and diff***************************/
		
	// /*** Remove Illegal Kmer Edges ***/
	SGRemoveIllegalKmerEdgeVisitor ikeVisit(opt::pBWT,opt::kmerLength,opt::kmerThreshold, opt::credibleOverlapLength);
	std::cout << "\n[ Remove Illegal Kmer Edges due to kmerization ]" << std::endl;
	pGraph->visitP(ikeVisit);
	pGraph->simplify();

	/*** trim dead end vertices from small to large***/
	size_t trimLen = opt::kmerLength+1 , stepsize=opt::insertSize/5;
    while (trimLen < opt::insertSize *3/2)
    {
			SGTrimVisitor shortTrimVisit("",trimLen);
			std::cout << "[ Trimming short vertices (<" << trimLen  << ") ]\n";

			if(pGraph->visitP(shortTrimVisit))
				pGraph->simplify();

			 trimLen=trimLen+stepsize;
    }

	/*** Pop Bubbles ***/
	std::cout << "\n[ Remove bubbles and tips ]\n";
	graphTrimAndSmooth (pGraph, opt::maxChimeraLength);
	// outputGraphAndFasta(pGraph,"popBubbles",++phase);

	/*** Remove small chimeric vertices ***/
	std::cout << "\n[ Remove small chimera vertices ]\n";
	for (size_t threshold=2; threshold<=opt::kmerThreshold; threshold++)
		RemoveVertexWithBothShortEdges (pGraph, opt::readLength, opt::credibleOverlapLength, opt::pBWT, opt::kmerLength, threshold);

	//remove edges with overlap <= 1st overlap length peak (from unmerged original reads)
	RemoveVertexWithBothShortEdges (pGraph, opt::readLength, pGraph->getMinOverlap());
	RemoveVertexWithBothShortEdges (pGraph, opt::readLength, opt::credibleOverlapLength);
	RemoveVertexWithBothShortEdges (pGraph, opt::insertSize, opt::credibleOverlapLength);
	RemoveVertexWithBothShortEdges (pGraph, opt::maxChimeraLength, opt::credibleOverlapLength);
    
	pGraph->contigStats();
	pGraph->visit(statsVisit);
	outputGraphAndFasta(pGraph,"",++phase);

	   /*** remove edges from 1st peak to 2nd peak in the overlap length distribution
	1st peak: opt::credibleOverlapLength , 2nd peak: opt::insertSize*opt::minOverlapRatio 
    Define large vertex as opt::maxChimeraLength*4
    be very careful and little by little for avoiding misassembly ***/
    std::cout << "\n[ Remove chimeric edges with insufficient overlap or large-overlap difference from large vertices]\n";
	size_t stepsize2 = (opt::insertSize*opt::minOverlapRatio-opt::credibleOverlapLength)/4;	//iterate 4 times
    for (size_t len = opt::credibleOverlapLength ; len <= opt::insertSize*opt::minOverlapRatio ; len+=stepsize2 )	
	{
		SGRemoveByOverlapLenDiffVisitor srv1(1600, len, (opt::insertSize*opt::minOverlapRatio)+opt::credibleOverlapLength-len);	
		if(pGraph->visitP(srv1)) 
            graphTrimAndSmooth(pGraph, opt::maxChimeraLength);
    }

    // fix min overlap length= insertsize*opt::minOverlapRatio, remove more edges by overlap diff from large to small diff
	// iterate 2 times
    for (size_t stepsize3 = opt::credibleOverlapLength/4; stepsize3 <= opt::credibleOverlapLength/2 ; stepsize3+=stepsize3 )
    {
        SGRemoveByOverlapLenDiffVisitor srv1(1600, /*opt::insertSize*opt::minOverlapRatio*/ 0, opt::credibleOverlapLength-stepsize3);
        if(pGraph->visitP(srv1)) 
            graphTrimAndSmooth(pGraph, opt::maxChimeraLength);
    }

	// SGRemoveByOverlapLenDiffVisitor srv1(opt::maxChimeraLength*3, /*opt::insertSize*opt::minOverlapRatio*/ 0, opt::credibleOverlapLength/3);
    // if(pGraph->visitP(srv1)) 
            // graphTrimAndSmooth(pGraph, opt::maxChimeraLength);

	RemoveVertexWithBothShortEdges (pGraph, opt::readLength+100, opt::readLength*0.9);

	pGraph->contigStats(); 
	pGraph->visit(statsVisit);
	outputGraphAndFasta(pGraph,"",++phase);

	
	/*** Remove edges according to PE links, which remove small repeat vertices most of the time ***/
	for (size_t minPELink = 1; minPELink <= 1; minPELink++ )
    {
		// 51 is more accurate while 31 produces longer contigs
		SGRemoveEdgeByPEVisitor REPEVistit(opt::indices, opt::insertSize, 51, minPELink);
		if(pGraph->visitP(REPEVistit))
			graphTrimAndSmooth(pGraph, opt::maxChimeraLength);
    }

	pGraph->contigStats();
	pGraph->visit(statsVisit);
	outputGraphAndFasta(pGraph,"",++phase);

	
	/** Incrementally remove chimera edges from small to large chimeric vertices using overlap ratio 
	  Be very careful and little by little for avoiding misassembly 
	  This visitor resolves chimeric with size roughly equal to read length+100 (first peak)
	  Resolve larger vertices lead to misassembly ***/
    std::cout << "\n[ Remove chimeric edges from small vertices using overlap ratios ]\n";
	size_t stepsize4 = 15;
	for (size_t len = opt::readLength ; len <= opt::readLength+100 ; len+=stepsize4 )
		RemoveSmallOverlapRatioEdges ( pGraph, len);
	
	// Rename requires extra memory which should
	// only be done after the string graph has been greatly simplified.
	pGraph->renameVertices("");

	/******* Re-join broken islands/tips due to high-GC errors ********/
	size_t min_size_of_islandtip=opt::maxChimeraLength;

    /***************** 1. Trim bad ends of island/tip *****************/
	SGFastaErosionVisitor eFAVisit (opt::pBWT, opt::kmerLength, opt::kmerThreshold, min_size_of_islandtip);
	pGraph->visitP(eFAVisit);
	
	pGraph->contigStats();
	pGraph->visitP(statsVisit);
	outputGraphAndFasta(pGraph,"", ++phase);
    
    /*** 2. Collect read IDs mapped to large island/tip with size > min_size_of_islandtip ***/
	ThreadSafeListVector tslv;
	tslv.resize(opt::pSSA->getNumberOfReads());
    SGIslandCollectVisitor sgicv(&tslv, opt::indices, opt::insertSize, 51, min_size_of_islandtip);
    pGraph->visitP(sgicv);
    
	/*** 3. Join islands/tips with PE support using FM-index walk (depth,leaves,minoverlap)=(150, 2000, 19) ***/
	SGJoinIslandVisitor sgjiv(100, 4000, opt::kmerLength/2+4, min_size_of_islandtip, &tslv, opt::indices, 3);
	pGraph->visitProgress(sgjiv);
	graphTrimAndSmooth (pGraph, opt::maxChimeraLength);

	std::cout << "\n[Stats] Final graph statistic:\n";
	pGraph->contigStats();
	pGraph->visitP(statsVisit);
	
	// Write the results
	std::cout << "\n<Printing the contig file> : " << opt::outContigsFile << " \n" << std::endl;
	SGFastaVisitor av(opt::outContigsFile);
	pGraph->visit(av);
	//std::cout << "<Printing the ASQG and dot files>\n" << std::endl;
	pGraph->writeASQG(opt::outGraphFile);
    pGraph->writeDot("StriDe-graph.dot",0);

	delete pGraph;
	return 0;

}
Пример #4
0
// Assemble contigs from the string graph in opt::asqgFile.
//
// The graph is loaded, contained vertices are removed, and then a sequence of
// option-controlled passes is applied (transitive reduction, validation,
// trimming, small-repeat resolution, coverage filtering, smoothing). Vertices
// are renamed with the "contig-" prefix, and the result is written to
// opt::outContigsFile (FASTA visitor) and opt::outGraphFile (ASQG).
void assemble()
{
    Timer t("sga assemble");
    StringGraph* pGraph = SGUtil::loadASQG(opt::asqgFile, opt::minOverlap, true);
    if(opt::bExact)
        pGraph->setExactMode(true);
    pGraph->printMemSize();

    // Visitors shared by the passes below. remodelVisit, edgeStatsVisit,
    // bubbleVisit and bubbleEdgeVisit are constructed here but not referenced
    // again in this function.
    SGTransitiveReductionVisitor trVisit;
    SGGraphStatsVisitor statsVisit;
    SGRemodelVisitor remodelVisit;
    SGEdgeStatsVisitor edgeStatsVisit;
    SGTrimVisitor trimVisit(opt::trimLengthThreshold);
    SGBubbleVisitor bubbleVisit;
    SGBubbleEdgeVisitor bubbleEdgeVisit;

    SGContainRemoveVisitor containVisit;
    SGValidateStructureVisitor validationVisit;

    // Statistics of the graph as loaded
    std::cout << "[Stats] Input graph:\n";
    pGraph->visit(statsVisit);    

    // Repeat the containment pass until the graph reports none left
    std::cout << "Removing contained vertices from graph\n";
    while(pGraph->hasContainment())
    {
        pGraph->visit(containVisit);
    }

    // Statistics once contained vertices are gone
    std::cout << "[Stats] After removing contained vertices:\n";
    pGraph->visit(statsVisit);    

    // Optional transitive reduction of any remaining transitive edges
    if(opt::bPerformTR)
    {
        std::cout << "Removing transitive edges\n";
        pGraph->visit(trVisit);
    }

    // Merge unbranched chains of vertices
    pGraph->simplify();

    // Optional structural validation
    if(opt::bValidate)
    {
        std::cout << "Validating graph structure\n";
        pGraph->visit(validationVisit);
    }

    // Dead-end trimming: one trim visit per requested round
    if(opt::numTrimRounds > 0)
    {
        std::cout << "Trimming bad vertices\n"; 
        const int numTrims = opt::numTrimRounds;
        for(int trimRound = 0; trimRound < numTrims; ++trimRound)
            pGraph->visit(trimVisit);

        std::cout << "\n[Stats] Graph after trimming:\n";
        pGraph->visit(statsVisit);
    }

    // Small repeat resolution: keep visiting while the visit returns true
    if(opt::resolveSmallRepeatLen > 0)
    {
        SGSmallRepeatResolveVisitor smallRepeatVisit(opt::resolveSmallRepeatLen);
        std::cout << "Resolving small repeats\n";

        int finishedRounds = 0;
        while(pGraph->visit(smallRepeatVisit))
        {
            std::cout << "Finished small repeat resolve round " << finishedRounds << "\n";
            ++finishedRounds;
        }

        std::cout << "\n[Stats] After small repeat resolution:\n";
        pGraph->visit(statsVisit);
    }

    // Coverage pass followed by three trim visits
    if(opt::coverageCutoff > 0)
    {
        std::cout << "Coverage visit\n";
        SGCoverageVisitor coverageVisit(opt::coverageCutoff);
        pGraph->visit(coverageVisit);
        for(int trimPass = 0; trimPass < 3; ++trimPass)
            pGraph->visit(trimVisit);
    }

    // Simplify again after the passes above
    pGraph->simplify();

    // Variation smoothing: one smoothing visit per requested round,
    // then a final simplification
    if(opt::numBubbleRounds > 0)
    {
        std::cout << "\nPerforming variation smoothing\n";
        SGSmoothingVisitor smoothingVisit(opt::outVariantsFile, opt::maxBubbleGapDivergence, opt::maxBubbleDivergence, opt::maxIndelLength);
        const int numSmooth = opt::numBubbleRounds;
        for(int smoothRound = 0; smoothRound < numSmooth; ++smoothRound)
            pGraph->visit(smoothingVisit);
        pGraph->simplify();
    }

    // Give the vertices contig IDs; this happens before the final
    // statistics are printed.
    pGraph->renameVertices("contig-");

    std::cout << "\n[Stats] Final graph:\n";
    pGraph->visit(statsVisit);

    // Write the contigs, then the graph
    SGFastaVisitor av(opt::outContigsFile);
    pGraph->visit(av);

    pGraph->writeASQG(opt::outGraphFile);

    delete pGraph;
}