void subgraph() { StringGraph* pGraph = SGUtil::loadASQG(opt::asqgFile, 0, true, opt::maxEdges); pGraph->printMemSize(); // Remove containments from the graph SGContainRemoveVisitor containVisit; std::cout << "Removing contained vertices\n"; while(pGraph->hasContainment()) { pGraph->visit(containVisit); } if (opt::maxOverlap) { SGMaximalOverlapVisitor moVisit(-1); // Remove non-maximal overlap edges std::cout << "Removing non-maximal overlap edges from graph\n"; pGraph->visit(moVisit); } StringGraph* pSubgraph = new StringGraph; // Set the graph parameters to match the main graph pSubgraph->setContainmentFlag(pGraph->hasContainment()); pSubgraph->setTransitiveFlag(pGraph->hasTransitive()); pSubgraph->setMinOverlap(pGraph->getMinOverlap()); pSubgraph->setErrorRate(pGraph->getErrorRate()); // Get the root vertex Vertex* pRootVertex = pGraph->getVertex(opt::rootID); if(pRootVertex == NULL) { std::cout << "Vertex " << opt::rootID << " not found in the graph.\n"; } else { copyVertexToSubgraph(pSubgraph, pRootVertex); pRootVertex->setColor(GC_BLACK); // Recursively add neighbors addNeighborsToSubgraph(pRootVertex, pSubgraph, opt::span); // Write the subgraph pSubgraph->writeASQG(opt::outFile); } delete pSubgraph; delete pGraph; }
int assemble() { StringGraph* pGraph; #pragma omp parallel { #pragma omp single nowait { std::cout << "\n[ Loading string graph: " << opt::asqgFile << " ]\n"; pGraph=SGUtil::loadASQGVertex(opt::asqgFile, opt::minOverlap, true, opt::maxEdges); } #pragma omp single nowait { std::cout << "[ Loading BWT ]\n"; opt::pBWT = new BWT(opt::prefix + BWT_EXT, BWT::DEFAULT_SAMPLE_RATE_SMALL); } #pragma omp single nowait { std::cout << "[ Loading RBWT ]\n"; opt::pRBWT = new BWT(opt::prefix + RBWT_EXT, BWT::DEFAULT_SAMPLE_RATE_SMALL); } #pragma omp single nowait { std::cout << "[ Loading SAI ]\n"; opt::pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI); } } opt::indices.pBWT = opt::pBWT; opt::indices.pRBWT = opt::pRBWT; opt::indices.pSSA = opt::pSSA; pGraph=SGUtil::loadASQGEdge(opt::asqgFile, opt::minOverlap, true, opt::maxEdges, pGraph); // if(opt::bExact) pGraph->setExactMode(opt::bExact); //pGraph->printMemSize(); // // Pre-assembly graph stats SGGraphStatsVisitor statsVisit; std::cout << "[Stats] Input graph:\n"; pGraph->visitP(statsVisit); int phase = 0 ; // Remove containments from the graph std::cout << "Removing contained vertices from graph\n"; SGContainRemoveVisitor containVisit; while(pGraph->hasContainment()) pGraph->visit(containVisit); /*---Remove Transitive Edges---*/ std::cout << "Removing transitive edges\n"; SGTransitiveReductionVisitor trVisit; pGraph->visit(trVisit); /*---Remove Transitive Edges---*/ // getchar(); // Compact together unbranched chains of vertices std::cout << "Start to simplify unipaths ...\n"; pGraph->simplify(); std::cout << "[Stats] Simplified graph:\n"; pGraph->visitP(statsVisit); /**********************Compute overlap raio and diff (for debug)************** std::ofstream ssol ("simpleOverlapLength.histo", std::ofstream::out); std::map<size_t,int> simpleStats = pGraph->getCountMap() ; //Stats simple overlap length std::cout << "< Compute Stats of Overlap Length and Ratios>" << std::endl; //ssol << "Length\tCount" << std::endl; for (std::map<size_t,int> ::iterator iter=simpleStats.begin(); iter!=simpleStats.end(); ++iter) ssol << iter->first << "\t" << iter->second << std::endl; ssol.close(); std::ofstream orStats ("overlapRaito.stats", std::ofstream::out); std::vector < overlapRatioInfo > overlapRatioStats = pGraph->getOverlapRatioStats(); for (std::vector < overlapRatioInfo >::iterator iter=overlapRatioStats.begin(); iter!=overlapRatioStats.end(); ++iter) orStats << (*iter).overlapLength << "\t" << (*iter).originalLength << "\t" << std::setiosflags(std::ios::fixed) << std::setprecision(3) << (double)(*iter).overlapLength/(*iter).originalLength << "\t" << (*iter).ratioDiff << std::endl; orStats.close(); **********************Compute overlap raio and diff***************************/ // /*** Remove Illegal Kmer Edges ***/ SGRemoveIllegalKmerEdgeVisitor ikeVisit(opt::pBWT,opt::kmerLength,opt::kmerThreshold, opt::credibleOverlapLength); std::cout << "\n[ Remove Illegal Kmer Edges due to kmerization ]" << std::endl; pGraph->visitP(ikeVisit); pGraph->simplify(); /*** trim dead end vertices from small to large***/ size_t trimLen = opt::kmerLength+1 , stepsize=opt::insertSize/5; while (trimLen < opt::insertSize *3/2) { SGTrimVisitor shortTrimVisit("",trimLen); std::cout << "[ Trimming short vertices (<" << trimLen << ") ]\n"; if(pGraph->visitP(shortTrimVisit)) pGraph->simplify(); trimLen=trimLen+stepsize; } /*** Pop Bubbles ***/ std::cout << "\n[ Remove bubbles and tips ]\n"; graphTrimAndSmooth (pGraph, opt::maxChimeraLength); // outputGraphAndFasta(pGraph,"popBubbles",++phase); /*** Remove small chimeric vertices ***/ std::cout << "\n[ Remove small chimera vertices ]\n"; for (size_t threshold=2; threshold<=opt::kmerThreshold; threshold++) RemoveVertexWithBothShortEdges (pGraph, opt::readLength, opt::credibleOverlapLength, opt::pBWT, opt::kmerLength, threshold); //remove edges with overlap <= 1st overlap length peak (from unmerged original reads) RemoveVertexWithBothShortEdges (pGraph, opt::readLength, pGraph->getMinOverlap()); RemoveVertexWithBothShortEdges (pGraph, opt::readLength, opt::credibleOverlapLength); RemoveVertexWithBothShortEdges (pGraph, opt::insertSize, opt::credibleOverlapLength); RemoveVertexWithBothShortEdges (pGraph, opt::maxChimeraLength, opt::credibleOverlapLength); pGraph->contigStats(); pGraph->visit(statsVisit); outputGraphAndFasta(pGraph,"",++phase); /*** remove edges from 1st peak to 2nd peak in the overlap length distribution 1st peak: opt::credibleOverlapLength , 2nd peak: opt::insertSize*opt::minOverlapRatio Define large vertex as opt::maxChimeraLength*4 be very careful and little by little for avoiding misassembly ***/ std::cout << "\n[ Remove chimeric edges with insufficient overlap or large-overlap difference from large vertices]\n"; size_t stepsize2 = (opt::insertSize*opt::minOverlapRatio-opt::credibleOverlapLength)/4; //iterate 4 times for (size_t len = opt::credibleOverlapLength ; len <= opt::insertSize*opt::minOverlapRatio ; len+=stepsize2 ) { SGRemoveByOverlapLenDiffVisitor srv1(1600, len, (opt::insertSize*opt::minOverlapRatio)+opt::credibleOverlapLength-len); if(pGraph->visitP(srv1)) graphTrimAndSmooth(pGraph, opt::maxChimeraLength); } // fix min overlap length= insertsize*opt::minOverlapRatio, remove more edges by overlap diff from large to small diff // iterate 2 times for (size_t stepsize3 = opt::credibleOverlapLength/4; stepsize3 <= opt::credibleOverlapLength/2 ; stepsize3+=stepsize3 ) { SGRemoveByOverlapLenDiffVisitor srv1(1600, /*opt::insertSize*opt::minOverlapRatio*/ 0, opt::credibleOverlapLength-stepsize3); if(pGraph->visitP(srv1)) graphTrimAndSmooth(pGraph, opt::maxChimeraLength); } // SGRemoveByOverlapLenDiffVisitor srv1(opt::maxChimeraLength*3, /*opt::insertSize*opt::minOverlapRatio*/ 0, opt::credibleOverlapLength/3); // if(pGraph->visitP(srv1)) // graphTrimAndSmooth(pGraph, opt::maxChimeraLength); RemoveVertexWithBothShortEdges (pGraph, opt::readLength+100, opt::readLength*0.9); pGraph->contigStats(); pGraph->visit(statsVisit); outputGraphAndFasta(pGraph,"",++phase); /*** Remove edges according to PE links, which remove small repeat vertices most of the time ***/ for (size_t minPELink = 1; minPELink <= 1; minPELink++ ) { // 51 is more accurate while 31 produces longer contigs SGRemoveEdgeByPEVisitor REPEVistit(opt::indices, opt::insertSize, 51, minPELink); if(pGraph->visitP(REPEVistit)) graphTrimAndSmooth(pGraph, opt::maxChimeraLength); } pGraph->contigStats(); pGraph->visit(statsVisit); outputGraphAndFasta(pGraph,"",++phase); /** Incrementally remove chimera edges from small to large chimeric vertices using overlap ratio Be very careful and little by little for avoiding misassembly This visitor resolves chimeric with size roughly equal to read length+100 (first peak) Resolve larger vertices lead to misassembly ***/ std::cout << "\n[ Remove chimeric edges from small vertices using overlap ratios ]\n"; size_t stepsize4 = 15; for (size_t len = opt::readLength ; len <= opt::readLength+100 ; len+=stepsize4 ) RemoveSmallOverlapRatioEdges ( pGraph, len); // Rename requires extra memory which should // only be done after the string graph has been greatly simplified. pGraph->renameVertices(""); /******* Re-join broken islands/tips due to high-GC errors ********/ size_t min_size_of_islandtip=opt::maxChimeraLength; /***************** 1. Trim bad ends of island/tip *****************/ SGFastaErosionVisitor eFAVisit (opt::pBWT, opt::kmerLength, opt::kmerThreshold, min_size_of_islandtip); pGraph->visitP(eFAVisit); pGraph->contigStats(); pGraph->visitP(statsVisit); outputGraphAndFasta(pGraph,"", ++phase); /*** 2. Collect read IDs mapped to large island/tip with size > min_size_of_islandtip ***/ ThreadSafeListVector tslv; tslv.resize(opt::pSSA->getNumberOfReads()); SGIslandCollectVisitor sgicv(&tslv, opt::indices, opt::insertSize, 51, min_size_of_islandtip); pGraph->visitP(sgicv); /*** 3. Join islands/tips with PE support using FM-index walk (depth,leaves,minoverlap)=(150, 2000, 19) ***/ SGJoinIslandVisitor sgjiv(100, 4000, opt::kmerLength/2+4, min_size_of_islandtip, &tslv, opt::indices, 3); pGraph->visitProgress(sgjiv); graphTrimAndSmooth (pGraph, opt::maxChimeraLength); std::cout << "\n[Stats] Final graph statistic:\n"; pGraph->contigStats(); pGraph->visitP(statsVisit); // Write the results std::cout << "\n<Printing the contig file> : " << opt::outContigsFile << " \n" << std::endl; SGFastaVisitor av(opt::outContigsFile); pGraph->visit(av); //std::cout << "<Printing the ASQG and dot files>\n" << std::endl; pGraph->writeASQG(opt::outGraphFile); pGraph->writeDot("StriDe-graph.dot",0); delete pGraph; return 0; }
void assemble() { Timer t("sga assemble"); StringGraph* pGraph = SGUtil::loadASQG(opt::asqgFile, opt::minOverlap, true); if(opt::bExact) pGraph->setExactMode(true); pGraph->printMemSize(); // Visitor functors SGTransitiveReductionVisitor trVisit; SGGraphStatsVisitor statsVisit; SGRemodelVisitor remodelVisit; SGEdgeStatsVisitor edgeStatsVisit; SGTrimVisitor trimVisit(opt::trimLengthThreshold); SGBubbleVisitor bubbleVisit; SGBubbleEdgeVisitor bubbleEdgeVisit; SGContainRemoveVisitor containVisit; SGValidateStructureVisitor validationVisit; // Pre-assembly graph stats std::cout << "[Stats] Input graph:\n"; pGraph->visit(statsVisit); // Remove containments from the graph std::cout << "Removing contained vertices from graph\n"; while(pGraph->hasContainment()) pGraph->visit(containVisit); // Pre-assembly graph stats std::cout << "[Stats] After removing contained vertices:\n"; pGraph->visit(statsVisit); // Remove any extraneous transitive edges that may remain in the graph if(opt::bPerformTR) { std::cout << "Removing transitive edges\n"; pGraph->visit(trVisit); } // Compact together unbranched chains of vertices pGraph->simplify(); if(opt::bValidate) { std::cout << "Validating graph structure\n"; pGraph->visit(validationVisit); } // Remove dead-end branches from the graph if(opt::numTrimRounds > 0) { std::cout << "Trimming bad vertices\n"; int numTrims = opt::numTrimRounds; while(numTrims-- > 0) pGraph->visit(trimVisit); std::cout << "\n[Stats] Graph after trimming:\n"; pGraph->visit(statsVisit); } // Resolve small repeats if(opt::resolveSmallRepeatLen > 0) { SGSmallRepeatResolveVisitor smallRepeatVisit(opt::resolveSmallRepeatLen); std::cout << "Resolving small repeats\n"; int totalSmallRepeatRounds = 0; while(pGraph->visit(smallRepeatVisit)) std::cout << "Finished small repeat resolve round " << totalSmallRepeatRounds++ << "\n"; std::cout << "\n[Stats] After small repeat resolution:\n"; pGraph->visit(statsVisit); } // if(opt::coverageCutoff > 0) { std::cout << "Coverage visit\n"; SGCoverageVisitor coverageVisit(opt::coverageCutoff); pGraph->visit(coverageVisit); pGraph->visit(trimVisit); pGraph->visit(trimVisit); pGraph->visit(trimVisit); } // Peform another round of simplification pGraph->simplify(); if(opt::numBubbleRounds > 0) { std::cout << "\nPerforming variation smoothing\n"; SGSmoothingVisitor smoothingVisit(opt::outVariantsFile, opt::maxBubbleGapDivergence, opt::maxBubbleDivergence, opt::maxIndelLength); int numSmooth = opt::numBubbleRounds; while(numSmooth-- > 0) pGraph->visit(smoothingVisit); pGraph->simplify(); } pGraph->renameVertices("contig-"); std::cout << "\n[Stats] Final graph:\n"; pGraph->visit(statsVisit); // Rename the vertices to have contig IDs instead of read IDs //pGraph->renameVertices("contig-"); // Write the results SGFastaVisitor av(opt::outContigsFile); pGraph->visit(av); pGraph->writeASQG(opt::outGraphFile); delete pGraph; }